From 5b833825fd0012aec00f8fa5769d5400c18d59d8 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 11 Mar 2015 17:56:23 +0100 Subject: NFSv4.1: don't export static symbol The semantic patch that fixes this problem is as follows: (http://coccinelle.lip6.fr/) // @r@ type T; identifier f; @@ static T f (...) { ... } @@ identifier r.f; declarer name EXPORT_SYMBOL_GPL; @@ -EXPORT_SYMBOL_GPL(f); // Signed-off-by: Julia Lawall Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4f802b02fbb9..0b2bca9ae281 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1902,7 +1902,6 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) pnfs_put_lseg(hdr->lseg); nfs_pgio_header_free(hdr); } -EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) @@ -2032,7 +2031,6 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) pnfs_put_lseg(hdr->lseg); nfs_pgio_header_free(hdr); } -EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) -- cgit v1.2.3 From df137bc125b6155e54e6725faf0462b791a34c31 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Mar 2015 14:37:25 +0100 Subject: nfs: do not export discarded symbols The function nfs4_pnfs_v3_ds_connect_unload is exported so it can be used by other modules, but is also marked '__exit' and will be discarded when built into the kernel, as pointed out by this linker error: `nfs4_pnfs_v3_ds_connect_unload' referenced in section `___ksymtab_gpl+nfs4_pnfs_v3_ds_connect_unload' of fs/built-in.o: defined in discarded section `.exit.text' of fs/built-in.o This removes the __exit annotation to make it safe to call this function. Signed-off-by: Arnd Bergmann Fixes: 5f01d9539496 ("nfs41: create NFSv3 DS connection if specified") Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_nfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 54e36b38fb5f..8582dae5ae99 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -561,7 +561,7 @@ static bool load_v3_ds_connect(void) return(get_v3_ds_connect != NULL); } -void __exit nfs4_pnfs_v3_ds_connect_unload(void) +void nfs4_pnfs_v3_ds_connect_unload(void) { if (get_v3_ds_connect) { symbol_put(nfs3_set_ds_client); -- cgit v1.2.3 From 09a330f4b9324e40947cc4fff13606719382c580 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 5 Dec 2014 21:52:49 -0500 Subject: NFS: remount with security change should return EINVAL A remount that alters security flavors can appear to succeed when it should instead return -EINVAL. Check to see if the current security flavor exists within the flavors specified in the remount options, and if not fail the remount. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 322b2de02988..54a079a465e1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2193,7 +2193,7 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->version != nfss->nfs_client->rpc_ops->version || data->minorversion != nfss->nfs_client->cl_minorversion || data->retrans != nfss->client->cl_timeout->to_retries || - data->selected_flavor != nfss->client->cl_auth->au_flavor || + !nfs_auth_info_match(&data->auth_info, nfss->client->cl_auth->au_flavor) || data->acregmin != nfss->acregmin / HZ || data->acregmax != nfss->acregmax / HZ || data->acdirmin != nfss->acdirmin / HZ || @@ -2241,7 +2241,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->wsize = nfss->wsize; data->retrans = nfss->client->cl_timeout->to_retries; data->selected_flavor = nfss->client->cl_auth->au_flavor; - data->auth_info = nfss->auth_info; data->acregmin = nfss->acregmin / HZ; data->acregmax = nfss->acregmax / HZ; data->acdirmin = nfss->acdirmin / HZ; -- cgit v1.2.3 From f0eede10fd401a1c5193e020cd372d7b7014d9f3 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Tue, 3 Mar 2015 04:24:25 -0500 Subject: SUNRPC: use jiffies_to_msecs for converting jiffies Use jiffies_to_msecs for converting jiffies as it handles all of the corner cases reliably and also helps readability. Signed-off-by: Nicholas Mc Guire Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b91fd9c597b4..c234e7ff2bf0 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -90,7 +90,7 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) return; dprintk("RPC: %5u setting alarm for %lu ms\n", - task->tk_pid, task->tk_timeout * 1000 / HZ); + task->tk_pid, jiffies_to_msecs(task->tk_timeout)); task->u.tk_wait.expires = jiffies + task->tk_timeout; if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires)) -- cgit v1.2.3 From 38942ba204639030b3f3cfe5748a6aff3c9dcc81 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 4 Mar 2015 15:59:05 -0500 Subject: NFSv4: Append delegations to the per-client list instead of prepending Do so on the assumption that for most use cases, that list will turn into a more or less LRU-ordered list, and so the list traversals in nfs_client_return_marked_delegations() are likely to be shorter before hitting a candidate to return. Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index a6ad68865880..540ee260318c 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -378,7 +378,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct if (freeme == NULL) goto out; } - list_add_rcu(&delegation->super_list, &server->delegations); + list_add_tail_rcu(&delegation->super_list, &server->delegations); rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; -- cgit v1.2.3 From 55cc1d7806cd4827d38a18ddf3bcf04706758218 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Fri, 13 Mar 2015 04:25:34 -0400 Subject: SUNRPC: fix build-warning due to format missmatch fix build-warning introduced by commit: f0eede10fd4 ("SUNRPC: use jiffies_to_msecs for converting jiffies") which did not fixup the format properly (my bad). Signed-off-by: Nicholas Mc Guire Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index c234e7ff2bf0..337ca851a350 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -89,8 +89,8 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) if (!task->tk_timeout) return; - dprintk("RPC: %5u setting alarm for %lu ms\n", - task->tk_pid, jiffies_to_msecs(task->tk_timeout)); + dprintk("RPC: %5u setting alarm for %u ms\n", + task->tk_pid, jiffies_to_msecs(task->tk_timeout)); task->u.tk_wait.expires = jiffies + task->tk_timeout; if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires)) -- cgit v1.2.3 From 2854475f6c612d59901d51c358abd05643278b53 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 21 Jan 2015 00:20:11 +0800 Subject: nfs: clean up nfs_direct_IO This follows up "nfs: fix dio deadlock when O_DIRECT flag is flipped" and removes the unnecessary CONFIG_NFS_SWAP switch. Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e907c8cf732e..51f617e45fd1 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -259,18 +259,11 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t if (!IS_SWAPFILE(inode)) return 0; -#ifndef CONFIG_NFS_SWAP - dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", - iocb->ki_filp, (long long) pos, iter->nr_segs); - - return -EINVAL; -#else VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); if (rw == READ) return nfs_file_direct_read(iocb, iter, pos); return nfs_file_direct_write(iocb, iter, pos); -#endif /* CONFIG_NFS_SWAP */ } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) -- cgit v1.2.3 From 0695314ef0920b745423510f1e96bf60415a404a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 23 Mar 2015 16:10:00 -0400 Subject: SUNRPC: Fix a regression when reconnecting If the task needs to give up the socket lock in order to allow a reconnect to occur, then it must also clear the 'rq_bytes_sent' field so that when it retransmits, it knows to start from the beginning. Fixes: 718ba5b87343 ("SUNRPC: Add helpers to prevent socket create from racing") Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e3015aede0d9..ed4aa786c240 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -326,6 +326,15 @@ out_unlock: xprt_clear_locked(xprt); } +static void xprt_task_clear_bytes_sent(struct rpc_task *task) +{ + if (task != NULL) { + struct rpc_rqst *req = task->tk_rqstp; + if (req != NULL) + req->rq_bytes_sent = 0; + } +} + /** * xprt_release_xprt - allow other requests to use a transport * @xprt: transport with other tasks potentially waiting @@ -336,11 +345,7 @@ out_unlock: void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } + xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } @@ -358,11 +363,7 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt); void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } + xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } @@ -700,6 +701,7 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; + xprt_task_clear_bytes_sent(task); xprt->snd_task = cookie; ret = true; out: -- cgit v1.2.3 From 5fcdfacc01f3a3415b75acb172cd75c9fe1cbda5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Mar 2015 13:19:42 -0400 Subject: NFSv4: Return delegations synchronously in evict_inode Kinglong Mee reports that asynchronous delegations are being killed by the call to rpc_shutdown_client() when unmounting. This can lead to state leakage on the server until the client lease expires. Reported-by: Kinglong Mee Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index a6ad68865880..08c624448750 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -514,7 +514,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode) delegation = nfs_inode_detach_delegation(inode); if (delegation != NULL) - nfs_do_return_delegation(inode, delegation, 0); + nfs_do_return_delegation(inode, delegation, 1); } /** -- cgit v1.2.3 From 84a80f62f71beac20a426709c04b49f2bd352291 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Mar 2015 15:23:35 -0400 Subject: NFSv4.1: Convert pNFS deviceid to use kfree_rcu() Use of synchronize_rcu() when unmounting and potentially freeing a lot of deviceids is problematic. There really is no reason why we can't just use kfree_rcu() here. Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/dev.c | 2 +- fs/nfs/filelayout/filelayoutdev.c | 2 +- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 2 +- fs/nfs/objlayout/objio_osd.c | 2 +- fs/nfs/pnfs.h | 1 + fs/nfs/pnfs_dev.c | 6 ++---- 6 files changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 5aed4f98df41..e535599a0719 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -33,7 +33,7 @@ bl_free_deviceid_node(struct nfs4_deviceid_node *d) container_of(d, struct pnfs_block_dev, node); bl_free_device(dev); - kfree(dev); + kfree_rcu(dev, node.rcu); } static int diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 4f372e224603..4946ef40ba87 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -55,7 +55,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) nfs4_pnfs_ds_put(ds); } kfree(dsaddr->stripe_indices); - kfree(dsaddr); + kfree_rcu(dsaddr, id_node.rcu); } /* Decode opaque device data and return the result */ diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e2c01f204a95..77a2d026aa12 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -30,7 +30,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { nfs4_print_deviceid(&mirror_ds->id_node.deviceid); nfs4_pnfs_ds_put(mirror_ds->ds); - kfree(mirror_ds); + kfree_rcu(mirror_ds, id_node.rcu); } /* Decode opaque device data and construct new_ds using it */ diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 24e1d7403c0b..8b5e0e687d5e 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -57,7 +57,7 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) dprintk("%s: free od=%p\n", __func__, de->od.od); osduld_put_device(de->od.od); - kfree(de); + kfree_rcu(d, rcu); } struct objio_segment { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 635f0865671c..a1fc16c971a7 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -302,6 +302,7 @@ struct nfs4_deviceid_node { unsigned long flags; unsigned long timestamp_unavailable; struct nfs4_deviceid deviceid; + struct rcu_head rcu; atomic_t ref; }; diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index aa2ec0015183..bf23ac97d57d 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -175,8 +175,8 @@ __nfs4_find_get_deviceid(struct nfs_server *server, rcu_read_lock(); d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id, hash); - if (d != NULL) - atomic_inc(&d->ref); + if (d != NULL && !atomic_inc_not_zero(&d->ref)) + d = NULL; rcu_read_unlock(); return d; } @@ -236,7 +236,6 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, } hlist_del_init_rcu(&d->node); spin_unlock(&nfs4_deviceid_lock); - synchronize_rcu(); /* balance the initial ref set in pnfs_insert_deviceid */ if (atomic_dec_and_test(&d->ref)) @@ -321,7 +320,6 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash) if (hlist_empty(&tmp)) return; - synchronize_rcu(); while (!hlist_empty(&tmp)) { d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); hlist_del(&d->tmpnode); -- cgit v1.2.3 From fb1458f4578c8dc78dc124de2a58950a6d4e3492 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Mar 2015 15:37:00 -0400 Subject: NFSv4.1: Cleanup - don't opencode nfs4_put_deviceid_node() There really is no reason to do so. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index bf23ac97d57d..7e07f4ba4822 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -238,8 +238,7 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, spin_unlock(&nfs4_deviceid_lock); /* balance the initial ref set in pnfs_insert_deviceid */ - if (atomic_dec_and_test(&d->ref)) - d->ld->free_deviceid_node(d); + nfs4_put_deviceid_node(d); } EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); @@ -323,8 +322,7 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash) while (!hlist_empty(&tmp)) { d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); hlist_del(&d->tmpnode); - if (atomic_dec_and_test(&d->ref)) - d->ld->free_deviceid_node(d); + nfs4_put_deviceid_node(d); } } -- cgit v1.2.3 From 4e59080397faadee59d39ffa2116dc8607adc9c9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Mar 2015 14:01:25 -0400 Subject: NFSv4.1: Allow getdeviceinfo to return notification info back to caller We are only allowed to cache deviceinfo if the server supports notifications and actually promises to call us back when changes occur. Right now, we request those notifications, but then we don't check the server's reply. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 +++++ fs/nfs/nfs4xdr.c | 14 +++++--------- include/linux/nfs_xdr.h | 2 ++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 627f37c44456..ba8b2b5e98a1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7944,6 +7944,8 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, { struct nfs4_getdeviceinfo_args args = { .pdev = pdev, + .notify_types = NOTIFY_DEVICEID4_CHANGE | + NOTIFY_DEVICEID4_DELETE, }; struct nfs4_getdeviceinfo_res res = { .pdev = pdev, @@ -7958,6 +7960,9 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, dprintk("--> %s\n", __func__); status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + if (res.notification & ~args.notify_types) + dprintk("%s: unsupported notification\n", __func__); + dprintk("<-- %s status=%d\n", __func__, status); return status; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5c399ec41079..6b28a605c697 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1920,7 +1920,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr, p = reserve_space(xdr, 4 + 4); *p++ = cpu_to_be32(1); /* bitmap length */ - *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE); + *p++ = cpu_to_be32(args->notify_types); } static void @@ -5753,8 +5753,9 @@ out_overflow: #if defined(CONFIG_NFS_V4_1) static int decode_getdeviceinfo(struct xdr_stream *xdr, - struct pnfs_device *pdev) + struct nfs4_getdeviceinfo_res *res) { + struct pnfs_device *pdev = res->pdev; __be32 *p; uint32_t len, type; int status; @@ -5802,12 +5803,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, if (unlikely(!p)) goto out_overflow; - if (be32_to_cpup(p++) & - ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) { - dprintk("%s: unsupported notification\n", - __func__); - } - + res->notification = be32_to_cpup(p++); for (i = 1; i < len; i++) { if (be32_to_cpup(p++)) { dprintk("%s: unsupported notification\n", @@ -7061,7 +7057,7 @@ static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, status = decode_sequence(xdr, &res->seq_res, rqstp); if (status != 0) goto out; - status = decode_getdeviceinfo(xdr, res->pdev); + status = decode_getdeviceinfo(xdr, res); out: return status; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 4cb3eaa89cf7..3d88908fd140 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -255,11 +255,13 @@ struct nfs4_layoutget { struct nfs4_getdeviceinfo_args { struct nfs4_sequence_args seq_args; struct pnfs_device *pdev; + __u32 notify_types; }; struct nfs4_getdeviceinfo_res { struct nfs4_sequence_res seq_res; struct pnfs_device *pdev; + __u32 notification; }; struct nfs4_layoutcommit_args { -- cgit v1.2.3 From df52699e4fcefe30ebe4f1db48bd161254a0b102 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Mar 2015 14:48:32 -0400 Subject: NFSv4.1: Don't cache deviceids that have no notifications The spec says that once all layouts that reference a given deviceid have been returned, then we are only allowed to continue to cache the deviceid if the metadata server supports notifications. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 ++ fs/nfs/pnfs.h | 2 ++ fs/nfs/pnfs_dev.c | 9 +++++++++ 3 files changed, 13 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ba8b2b5e98a1..9ff8c63c9cac 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7962,6 +7962,8 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (res.notification & ~args.notify_types) dprintk("%s: unsupported notification\n", __func__); + if (res.notification != args.notify_types) + pdev->nocache = 1; dprintk("<-- %s status=%d\n", __func__, status); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index a1fc16c971a7..b5654e8da936 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -203,6 +203,7 @@ struct pnfs_device { struct page **pages; unsigned int pgbase; unsigned int pglen; /* reply buffer length */ + unsigned char nocache : 1;/* May not be cached */ }; #define NFS4_PNFS_GETDEVLIST_MAXNUM 16 @@ -291,6 +292,7 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, enum { NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */ + NFS_DEVICEID_NOCACHE, /* device may not be cached */ }; /* pnfs_dev.c */ diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 7e07f4ba4822..2961fcd7a2df 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -149,6 +149,8 @@ nfs4_get_device_info(struct nfs_server *server, */ d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev, gfp_flags); + if (d && pdev->nocache) + set_bit(NFS_DEVICEID_NOCACHE, &d->flags); out_free_pages: for (i = 0; i < max_pages; i++) @@ -235,6 +237,7 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, return; } hlist_del_init_rcu(&d->node); + clear_bit(NFS_DEVICEID_NOCACHE, &d->flags); spin_unlock(&nfs4_deviceid_lock); /* balance the initial ref set in pnfs_insert_deviceid */ @@ -269,6 +272,11 @@ EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) { + if (test_bit(NFS_DEVICEID_NOCACHE, &d->flags)) { + if (atomic_add_unless(&d->ref, -1, 2)) + return false; + nfs4_delete_deviceid(d->ld, d->nfs_client, &d->deviceid); + } if (!atomic_dec_and_test(&d->ref)) return false; d->ld->free_deviceid_node(d); @@ -312,6 +320,7 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash) if (d->nfs_client == clp && atomic_read(&d->ref)) { hlist_del_init_rcu(&d->node); hlist_add_head(&d->tmpnode, &tmp); + clear_bit(NFS_DEVICEID_NOCACHE, &d->flags); } rcu_read_unlock(); spin_unlock(&nfs4_deviceid_lock); -- cgit v1.2.3 From fc87701b918c05c2d78f8191f5bc3c6c178bdb3d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Mar 2015 17:25:14 -0400 Subject: NFS: Fix free_deveiceid -> free_deviceid Make it easier to grep for these functions by name. Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 4 ++-- fs/nfs/flexfilelayout/flexfilelayout.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 91e88a7ecef0..5639a2ef671a 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1086,7 +1086,7 @@ filelayout_alloc_deviceid_node(struct nfs_server *server, } static void -filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) +filelayout_free_deviceid_node(struct nfs4_deviceid_node *d) { nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); } @@ -1137,7 +1137,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, .alloc_deviceid_node = filelayout_alloc_deviceid_node, - .free_deviceid_node = filelayout_free_deveiceid_node, + .free_deviceid_node = filelayout_free_deviceid_node, }; static int __init nfs4filelayout_init(void) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 315cc68945b9..edfb27ea4ce2 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1414,7 +1414,7 @@ ff_layout_get_ds_info(struct inode *inode) } static void -ff_layout_free_deveiceid_node(struct nfs4_deviceid_node *d) +ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) { nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds, id_node)); @@ -1498,7 +1498,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .pg_read_ops = &ff_layout_pg_read_ops, .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, - .free_deviceid_node = ff_layout_free_deveiceid_node, + .free_deviceid_node = ff_layout_free_deviceid_node, .mark_request_commit = pnfs_layout_mark_request_commit, .clear_request_commit = pnfs_generic_clear_request_commit, .scan_commit_lists = pnfs_generic_scan_commit_lists, -- cgit v1.2.3 From 81b79afb504870a31ee1f764e81a202685cde795 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Mar 2015 19:09:08 -0400 Subject: NFSv4: Allow tracing of NFSv4 fsync calls I appear to have missed this when adding the ftrace probes. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4file.c | 5 +++++ fs/nfs/nfstrace.c | 3 +++ 2 files changed, 8 insertions(+) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 8b46389c4c5b..0de62f7cfebf 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -10,6 +10,8 @@ #include "fscache.h" #include "pnfs.h" +#include "nfstrace.h" + #ifdef CONFIG_NFS_V4_2 #include "nfs42.h" #endif @@ -100,6 +102,8 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ret; struct inode *inode = file_inode(file); + trace_nfs_fsync_enter(inode); + do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) @@ -118,6 +122,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) end = LLONG_MAX; } while (ret == -EAGAIN); + trace_nfs_fsync_exit(inode, ret); return ret; } diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c index 4eb0aead69b6..c74f7af23d77 100644 --- a/fs/nfs/nfstrace.c +++ b/fs/nfs/nfstrace.c @@ -7,3 +7,6 @@ #define CREATE_TRACE_POINTS #include "nfstrace.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit); -- cgit v1.2.3 From 415320fc14eca40b564d1797e68895d15b51ac49 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Mar 2015 13:26:18 -0400 Subject: NFSv4: Return the delegation before returning the layout in evict_inode() Minor optimisation for the case where the layout has return-on-close enabled. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 75090feeafad..d91898193b2f 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -91,10 +91,11 @@ static void nfs4_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - pnfs_return_layout(inode); - pnfs_destroy_layout(NFS_I(inode)); /* If we are holding a delegation, return it! */ nfs_inode_return_delegation_noreclaim(inode); + /* Note that above delegreturn would trigger pnfs return-on-close */ + pnfs_return_layout(inode); + pnfs_destroy_layout(NFS_I(inode)); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); } -- cgit v1.2.3 From 29559b11aef072f893cd32280dcec9d7380ca011 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Mar 2015 20:10:20 -0400 Subject: NFSv4.1/pnfs: Fix setting of layoutcommit last write byte If the NFS_INO_LAYOUTCOMMIT flag was unset, then we _must_ ensure that we also reset the last write byte (lwb) for that layout. The current code depends on us clearing the lwb when we clear NFS_INO_LAYOUTCOMMIT, which is not the case when we call pnfs_clear_layoutcommit(). Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4f802b02fbb9..b96736df98e8 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2108,16 +2108,16 @@ pnfs_set_layoutcommit(struct nfs_pgio_header *hdr) spin_lock(&inode->i_lock); if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + nfsi->layout->plh_lwb = end_pos; mark_as_dirty = true; dprintk("%s: Set layoutcommit for inode %lu ", __func__, inode->i_ino); - } + } else if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { /* references matched in nfs4_layoutcommit_release */ pnfs_get_lseg(hdr->lseg); } - if (end_pos > nfsi->layout->plh_lwb) - nfsi->layout->plh_lwb = end_pos; spin_unlock(&inode->i_lock); dprintk("%s: lseg %p end_pos %llu\n", __func__, hdr->lseg, nfsi->layout->plh_lwb); @@ -2137,16 +2137,16 @@ void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data) spin_lock(&inode->i_lock); if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + nfsi->layout->plh_lwb = data->lwb; mark_as_dirty = true; dprintk("%s: Set layoutcommit for inode %lu ", __func__, inode->i_ino); - } + } else if (data->lwb > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = data->lwb; if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) { /* references matched in nfs4_layoutcommit_release */ pnfs_get_lseg(data->lseg); } - if (data->lwb > nfsi->layout->plh_lwb) - nfsi->layout->plh_lwb = data->lwb; spin_unlock(&inode->i_lock); dprintk("%s: lseg %p end_pos %llu\n", __func__, data->lseg, nfsi->layout->plh_lwb); @@ -2216,7 +2216,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) pnfs_list_write_lseg(inode, &data->lseg_list); end_pos = nfsi->layout->plh_lwb; - nfsi->layout->plh_lwb = 0; nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); spin_unlock(&inode->i_lock); @@ -2233,11 +2232,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = ld->prepare_layoutcommit(&data->args); if (status) { spin_lock(&inode->i_lock); - if (end_pos < nfsi->layout->plh_lwb) + set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); + if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; spin_unlock(&inode->i_lock); put_rpccred(data->cred); - set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); goto clear_layoutcommitting; } } -- cgit v1.2.3 From 67af7611ec57dbcbc96f9d9daa4d3c7b0999aa73 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Mar 2015 20:40:38 -0400 Subject: NFSv4.1/pnfs: Refactor pnfs_set_layoutcommit() pnfs_set_layoutcommit() and pnfs_commit_set_layoutcommit() are 100% identical except for the function arguments. Refactor to eliminate the difference. Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 5 ++-- fs/nfs/flexfilelayout/flexfilelayout.c | 5 ++-- fs/nfs/pnfs.c | 43 ++++++---------------------------- fs/nfs/pnfs.h | 3 +-- 4 files changed, 14 insertions(+), 42 deletions(-) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 5639a2ef671a..a317b007e436 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -258,7 +258,8 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) hdr->res.verf->committed != NFS_DATA_SYNC) return; - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -373,7 +374,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, } if (data->verf.committed == NFS_UNSTABLE) - pnfs_commit_set_layoutcommit(data); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index edfb27ea4ce2..92d2943e54f8 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -891,7 +891,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task, static void ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) { - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -1074,7 +1075,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, } if (data->verf.committed == NFS_UNSTABLE) - pnfs_commit_set_layoutcommit(data); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index b96736df98e8..ea83f3c03c65 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1841,7 +1841,8 @@ void pnfs_ld_write_done(struct nfs_pgio_header *hdr) { trace_nfs4_pnfs_write(hdr, hdr->pnfs_error); if (!hdr->pnfs_error) { - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); hdr->mds_ops->rpc_call_done(&hdr->task, hdr); } else pnfs_ld_handle_write_error(hdr); @@ -2099,11 +2100,10 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); void -pnfs_set_layoutcommit(struct nfs_pgio_header *hdr) +pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg, + loff_t end_pos) { - struct inode *inode = hdr->inode; struct nfs_inode *nfsi = NFS_I(inode); - loff_t end_pos = hdr->mds_offset + hdr->res.count; bool mark_as_dirty = false; spin_lock(&inode->i_lock); @@ -2114,13 +2114,13 @@ pnfs_set_layoutcommit(struct nfs_pgio_header *hdr) __func__, inode->i_ino); } else if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) { /* references matched in nfs4_layoutcommit_release */ - pnfs_get_lseg(hdr->lseg); + pnfs_get_lseg(lseg); } spin_unlock(&inode->i_lock); dprintk("%s: lseg %p end_pos %llu\n", - __func__, hdr->lseg, nfsi->layout->plh_lwb); + __func__, lseg, nfsi->layout->plh_lwb); /* if pnfs_layoutcommit_inode() runs between inode locks, the next one * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ @@ -2129,35 +2129,6 @@ pnfs_set_layoutcommit(struct nfs_pgio_header *hdr) } EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); -void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data) -{ - struct inode *inode = data->inode; - struct nfs_inode *nfsi = NFS_I(inode); - bool mark_as_dirty = false; - - spin_lock(&inode->i_lock); - if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { - nfsi->layout->plh_lwb = data->lwb; - mark_as_dirty = true; - dprintk("%s: Set layoutcommit for inode %lu ", - __func__, inode->i_ino); - } else if (data->lwb > nfsi->layout->plh_lwb) - nfsi->layout->plh_lwb = data->lwb; - if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) { - /* references matched in nfs4_layoutcommit_release */ - pnfs_get_lseg(data->lseg); - } - spin_unlock(&inode->i_lock); - dprintk("%s: lseg %p end_pos %llu\n", - __func__, data->lseg, nfsi->layout->plh_lwb); - - /* if pnfs_layoutcommit_inode() runs between inode locks, the next one - * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ - if (mark_as_dirty) - mark_inode_dirty_sync(inode); -} -EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit); - void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) { struct nfs_server *nfss = NFS_SERVER(data->args.inode); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index b5654e8da936..66bf5e1cf93d 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -264,8 +264,7 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); -void pnfs_set_layoutcommit(struct nfs_pgio_header *); -void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data); +void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); -- cgit v1.2.3 From 4d346bea8f0bf6d6351037899586a85435c15b9b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Mar 2015 16:38:33 -0400 Subject: NFS: Add a helper to sync both O_DIRECT and buffered writes Then apply it to nfs_setattr() and nfs_getattr(). Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d42dff6d5e98..60f907666697 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -133,6 +133,12 @@ void nfs_evict_inode(struct inode *inode) nfs_clear_inode(inode); } +static int nfs_sync_inode(struct inode *inode) +{ + nfs_inode_dio_wait(inode); + return nfs_wb_all(inode); +} + /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk */ @@ -525,10 +531,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) trace_nfs_setattr_enter(inode); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) { - nfs_inode_dio_wait(inode); - nfs_wb_all(inode); - } + if (S_ISREG(inode->i_mode)) + nfs_sync_inode(inode); fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -644,8 +648,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - nfs_inode_dio_wait(inode); - err = filemap_write_and_wait(inode->i_mapping); + err = nfs_sync_inode(inode); if (err) goto out; } -- cgit v1.2.3 From d9dabc1a01656868d53a1d08309ca24d922b1376 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 26 Mar 2015 15:55:49 -0400 Subject: NFS: File unlock needs to be a metadata synchronisation point File unlock needs to update both data and metadata on the NFS server in order to act as a synchronisation point for other clients. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e679d24c39d3..6959cb76744b 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -780,7 +780,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * Flush all pending writes before doing anything * with locks.. */ - nfs_sync_mapping(filp->f_mapping); + vfs_fsync(filp, 0); l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); if (!IS_ERR(l_ctx)) { -- cgit v1.2.3 From 9e1681c2e74bdd84bad6f74b47a4d88ad2f433df Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Mar 2015 17:23:31 -0400 Subject: NFSv4: Truncating file opens should also sync O_DIRECT writes We don't just want to sync out buffered writes, but also O_DIRECT ones. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 3 ++- fs/nfs/nfs4file.c | 2 +- include/linux/nfs_fs.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 60f907666697..f1fc0e4c1c02 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -133,11 +133,12 @@ void nfs_evict_inode(struct inode *inode) nfs_clear_inode(inode); } -static int nfs_sync_inode(struct inode *inode) +int nfs_sync_inode(struct inode *inode) { nfs_inode_dio_wait(inode); return nfs_wb_all(inode); } +EXPORT_SYMBOL_GPL(nfs_sync_inode); /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 0de62f7cfebf..8da5409e6f1a 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -59,7 +59,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (openflags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; - nfs_wb_all(inode); + nfs_sync_inode(inode); } inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b01ccf371fdc..b638eb6727c6 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -512,6 +512,7 @@ extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned * Try to write back everything synchronously (but check the * return value!) */ +extern int nfs_sync_inode(struct inode *inode); extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page* page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); -- cgit v1.2.3 From a0815d556d1cfb686b46995f86fb081f623fa720 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Mar 2015 18:58:53 -0400 Subject: NFSv4.1/pnfs: Ensure that writes respect the O_SYNC flag when doing O_DIRECT If the caller does not specify the O_SYNC flag, then it is legitimate to return from O_DIRECT without doing a pNFS layoutcommit operation. However if the file is opened O_DIRECT|O_SYNC then we'd better get it right. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 1 + fs/nfs/file.c | 1 + fs/nfs/nfs4file.c | 1 + 3 files changed, 3 insertions(+) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 51f617e45fd1..018a06a062bd 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -1034,6 +1034,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, if (i_size_read(inode) < iocb->ki_pos) i_size_write(inode, iocb->ki_pos); spin_unlock(&inode->i_lock); + generic_write_sync(file, pos, result); } } nfs_direct_req_release(dreq); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6959cb76744b..28228f381266 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -281,6 +281,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); + nfs_inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 8da5409e6f1a..befe7a2c6284 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -104,6 +104,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); + nfs_inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) -- cgit v1.2.3 From 7140171ea9be4143736c35acf6f31b6feb195ca0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Mar 2015 12:36:13 -0400 Subject: NFSv4.1/pnfs: Ensure we send layoutcommit before return-on-close We must not send a close or delegreturn that would result in a return-on-close of the layout without ensuring that we've also sent the necessary layoutcommit. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ea83f3c03c65..c2ce2db771b7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1090,6 +1090,7 @@ bool pnfs_roc(struct inode *ino) pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); + pnfs_layoutcommit_inode(ino, true); return true; out_noroc: @@ -1104,8 +1105,10 @@ out_noroc: } } spin_unlock(&ino->i_lock); - if (layoutreturn) + if (layoutreturn) { + pnfs_layoutcommit_inode(ino, true); pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + } return false; } -- cgit v1.2.3 From 5bb89b4702e22981445ae01af733a57d1cae2018 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Mar 2015 14:14:42 -0400 Subject: NFSv4.1/pnfs: Separate out metadata and data consistency for pNFS The LAYOUTCOMMIT operation means different things to different layout types. For blocks and objects, it is both a data and metadata consistency operation. For files and flexfiles, it is only a metadata consistency operation. This patch separates out the 2 cases, allowing the files/flexfiles layout drivers to optimise away the data consistency calls to layoutcommit. Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 1 + fs/nfs/filelayout/filelayout.c | 1 + fs/nfs/flexfilelayout/flexfilelayout.c | 1 + fs/nfs/nfs4file.c | 2 +- fs/nfs/objlayout/objio_osd.c | 2 ++ fs/nfs/pnfs.c | 7 +++++++ fs/nfs/pnfs.h | 18 ++++++++++++++++++ fs/nfs/pnfs_nfs.c | 10 ++++++++++ fs/nfs/write.c | 13 ++++++------- 9 files changed, 47 insertions(+), 8 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 1cac3c175d18..d2554fe140a3 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -890,6 +890,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .free_deviceid_node = bl_free_deviceid_node, .pg_read_ops = &bl_pg_read_ops, .pg_write_ops = &bl_pg_write_ops, + .sync = pnfs_generic_sync, }; static int __init nfs4blocklayout_init(void) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index a317b007e436..a46bf6de9ce4 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1139,6 +1139,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .write_pagelist = filelayout_write_pagelist, .alloc_deviceid_node = filelayout_alloc_deviceid_node, .free_deviceid_node = filelayout_free_deviceid_node, + .sync = pnfs_nfs_generic_sync, }; static int __init nfs4filelayout_init(void) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 92d2943e54f8..f3ff66e4fb6a 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1509,6 +1509,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .write_pagelist = ff_layout_write_pagelist, .alloc_deviceid_node = ff_layout_alloc_deviceid_node, .encode_layoutreturn = ff_layout_encode_layoutreturn, + .sync = pnfs_nfs_generic_sync, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index befe7a2c6284..866842b048ef 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -112,7 +112,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) mutex_lock(&inode->i_mutex); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) - ret = pnfs_layoutcommit_inode(inode, true); + ret = pnfs_sync_inode(inode, !!datasync); mutex_unlock(&inode->i_mutex); /* * If nfs_file_fsync_commit detected a server reboot, then diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 8b5e0e687d5e..5aaed363556a 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -637,6 +637,8 @@ static struct pnfs_layoutdriver_type objlayout_type = { .pg_read_ops = &objio_pg_read_ops, .pg_write_ops = &objio_pg_write_ops, + .sync = pnfs_generic_sync, + .free_deviceid_node = objio_free_deviceid_node, .encode_layoutcommit = objlayout_encode_layoutcommit, diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c2ce2db771b7..3f0affebcf05 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2231,6 +2231,13 @@ clear_layoutcommitting: } EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode); +int +pnfs_generic_sync(struct inode *inode, bool datasync) +{ + return pnfs_layoutcommit_inode(inode, true); +} +EXPORT_SYMBOL_GPL(pnfs_generic_sync); + struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { struct nfs4_threshold *thp; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 66bf5e1cf93d..231eb23c22da 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -155,6 +155,8 @@ struct pnfs_layoutdriver_type { int how, struct nfs_commit_info *cinfo); + int (*sync)(struct inode *inode, bool datasync); + /* * Return PNFS_ATTEMPTED to indicate the layout code has attempted * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS @@ -267,6 +269,8 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); +int pnfs_generic_sync(struct inode *inode, bool datasync); +int pnfs_nfs_generic_sync(struct inode *inode, bool datasync); int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_pgio_header *); @@ -488,6 +492,14 @@ pnfs_ld_read_whole_page(struct inode *inode) return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE; } +static inline int +pnfs_sync_inode(struct inode *inode, bool datasync) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return 0; + return NFS_SERVER(inode)->pnfs_curr_ld->sync(inode, datasync); +} + static inline bool pnfs_layoutcommit_outstanding(struct inode *inode) { @@ -570,6 +582,12 @@ pnfs_ld_read_whole_page(struct inode *inode) return false; } +static inline int +pnfs_sync_inode(struct inode *inode, bool datasync) +{ + return 0; +} + static inline bool pnfs_roc(struct inode *ino) { diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 54e36b38fb5f..64d2a5932f7b 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -868,3 +868,13 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, nfs_request_add_commit_list(req, list, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); + +int +pnfs_nfs_generic_sync(struct inode *inode, bool datasync) +{ + if (datasync) + return 0; + return pnfs_layoutcommit_inode(inode, true); +} +EXPORT_SYMBOL_GPL(pnfs_nfs_generic_sync); + diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 849ed784d6ac..7f933a39f385 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1840,17 +1840,16 @@ EXPORT_SYMBOL_GPL(nfs_write_inode); */ int nfs_wb_all(struct inode *inode) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = 0, - .range_end = LLONG_MAX, - }; int ret; trace_nfs_writeback_inode_enter(inode); - ret = sync_inode(inode, &wbc); + ret = filemap_write_and_wait(inode->i_mapping); + if (!ret) { + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (!ret) + pnfs_sync_inode(inode, true); + } trace_nfs_writeback_inode_exit(inode, ret); return ret; -- cgit v1.2.3 From 8c18d76bcba874e872410ca63c7e59b10aafa17d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Mar 2015 16:40:07 -0400 Subject: NFS: Block new writes while syncing data in nfs_getattr() Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f1fc0e4c1c02..0c3be2658546 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -649,7 +649,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { + mutex_lock(&inode->i_mutex); err = nfs_sync_inode(inode); + mutex_unlock(&inode->i_mutex); if (err) goto out; } -- cgit v1.2.3 From dbbc14775ae395a50d91f22904670ca4c9297542 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:33:33 -0400 Subject: SUNRPC: Introduce missing well-known netids Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/msg_prot.h | 8 +++++++- include/linux/sunrpc/xprtrdma.h | 5 ----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h index aadc6a04e1ac..807371357160 100644 --- a/include/linux/sunrpc/msg_prot.h +++ b/include/linux/sunrpc/msg_prot.h @@ -142,12 +142,18 @@ typedef __be32 rpc_fraghdr; (RPC_REPHDRSIZE + (2 + RPC_MAX_AUTH_SIZE/4)) /* - * RFC1833/RFC3530 rpcbind (v3+) well-known netid's. + * Well-known netids. See: + * + * http://www.iana.org/assignments/rpc-netids/rpc-netids.xhtml */ #define RPCBIND_NETID_UDP "udp" #define RPCBIND_NETID_TCP "tcp" +#define RPCBIND_NETID_RDMA "rdma" +#define RPCBIND_NETID_SCTP "sctp" #define RPCBIND_NETID_UDP6 "udp6" #define RPCBIND_NETID_TCP6 "tcp6" +#define RPCBIND_NETID_RDMA6 "rdma6" +#define RPCBIND_NETID_SCTP6 "sctp6" #define RPCBIND_NETID_LOCAL "local" /* diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 64a0a0a97b23..c984c85981ea 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -40,11 +40,6 @@ #ifndef _LINUX_SUNRPC_XPRTRDMA_H #define _LINUX_SUNRPC_XPRTRDMA_H -/* - * rpcbind (v3+) RDMA netid. - */ -#define RPCBIND_NETID_RDMA "rdma" - /* * Constants. Max RPC/NFS header is big enough to account for * additional marshaling buffers passed down by Linux client. -- cgit v1.2.3 From 0dd39cae26f3990789b4a558f9abafe59adc6fc1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:33:43 -0400 Subject: xprtrdma: Display IPv6 addresses and port numbers correctly Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 47 +++++++++++++++++++++++++++++++++-------- net/sunrpc/xprtrdma/verbs.c | 21 ++++++++---------- 2 files changed, 47 insertions(+), 21 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 2e192baa59f3..9be7f97205ba 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -156,13 +156,48 @@ static struct ctl_table sunrpc_table[] = { static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ +static void +xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + char buf[20]; + + snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA; +} + +static void +xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + char buf[40]; + + snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; +} + static void xprt_rdma_format_addresses(struct rpc_xprt *xprt) { struct sockaddr *sap = (struct sockaddr *) &rpcx_to_rdmad(xprt).addr; - struct sockaddr_in *sin = (struct sockaddr_in *)sap; - char buf[64]; + char buf[128]; + + switch (sap->sa_family) { + case AF_INET: + xprt_rdma_format_addresses4(xprt, sap); + break; + case AF_INET6: + xprt_rdma_format_addresses6(xprt, sap); + break; + default: + pr_err("rpcrdma: Unrecognized address family\n"); + return; + } (void)rpc_ntop(sap, buf, sizeof(buf)); xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); @@ -170,16 +205,10 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt) snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); - xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; - - snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); - xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); - snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); - /* netid */ - xprt->address_strings[RPC_DISPLAY_NETID] = "rdma"; + xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; } static void diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 124676c13780..1aa55b74b1b7 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include "xprt_rdma.h" @@ -424,7 +425,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) struct rpcrdma_ia *ia = &xprt->rx_ia; struct rpcrdma_ep *ep = &xprt->rx_ep; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; + struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr; #endif struct ib_qp_attr *attr = &ia->ri_qp_attr; struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr; @@ -480,9 +481,8 @@ connected: wake_up_all(&ep->rep_connect_wait); /*FALLTHROUGH*/ default: - dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", - __func__, &addr->sin_addr.s_addr, - ntohs(addr->sin_port), ep, + dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n", + __func__, sap, rpc_get_port(sap), ep, CONNECTION_MSG(event->event)); break; } @@ -491,19 +491,16 @@ connected: if (connstate == 1) { int ird = attr->max_dest_rd_atomic; int tird = ep->rep_remote_cma.responder_resources; - printk(KERN_INFO "rpcrdma: connection to %pI4:%u " - "on %s, memreg %d slots %d ird %d%s\n", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), + + pr_info("rpcrdma: connection to %pIS:%u on %s, memreg %d slots %d ird %d%s\n", + sap, rpc_get_port(sap), ia->ri_id->device->name, ia->ri_memreg_strategy, xprt->rx_buf.rb_max_requests, ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); } else if (connstate < 0) { - printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), - connstate); + pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n", + sap, rpc_get_port(sap), connstate); } #endif -- cgit v1.2.3 From e23779451ee05e824fedbb68bd17fc5c77e40166 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:33:53 -0400 Subject: xprtrdma: Perform a full marshal on retransmit Commit 6ab59945f292 ("xprtrdma: Update rkeys after transport reconnect" added logic in the ->send_request path to update the chunk list when an RPC/RDMA request is retransmitted. Note that rpc_xdr_encode() resets and re-encodes the entire RPC send buffer for each retransmit of an RPC. The RPC send buffer is not preserved from the previous transmission of an RPC. Revert 6ab59945f292, and instead, just force each request to be fully marshaled every time through ->send_request. This should preserve the fix from 6ab59945f292, while also performing pullup during retransmits. Signed-off-by: Chuck Lever Acked-by: Sagi Grimberg Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 71 +++++++++++++++++++---------------------- net/sunrpc/xprtrdma/transport.c | 5 +-- net/sunrpc/xprtrdma/xprt_rdma.h | 10 ------ 3 files changed, 34 insertions(+), 52 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 91ffde82fa0c..41456d9e5a7d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -53,6 +53,14 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static const char transfertypes[][12] = { "pure inline", /* no chunks */ @@ -283,28 +291,6 @@ out: return n; } -/* - * Marshal chunks. This routine returns the header length - * consumed by marshaling. - * - * Returns positive RPC/RDMA header size, or negative errno. - */ - -ssize_t -rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) -{ - struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf); - - if (req->rl_rtype != rpcrdma_noch) - result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, - headerp, req->rl_rtype); - else if (req->rl_wtype != rpcrdma_noch) - result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, - headerp, req->rl_wtype); - return result; -} - /* * Copy write data inline. * This function is used for "small" requests. Data which is passed @@ -397,6 +383,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) char *base; size_t rpclen, padlen; ssize_t hdrlen; + enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; /* @@ -433,13 +420,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * into pages; otherwise use reply chunks. */ if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - req->rl_wtype = rpcrdma_noch; + wtype = rpcrdma_noch; else if (rqst->rq_rcv_buf.page_len == 0) - req->rl_wtype = rpcrdma_replych; + wtype = rpcrdma_replych; else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) - req->rl_wtype = rpcrdma_writech; + wtype = rpcrdma_writech; else - req->rl_wtype = rpcrdma_replych; + wtype = rpcrdma_replych; /* * Chunks needed for arguments? @@ -456,16 +443,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * TBD check NFSv4 setacl */ if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) - req->rl_rtype = rpcrdma_noch; + rtype = rpcrdma_noch; else if (rqst->rq_snd_buf.page_len == 0) - req->rl_rtype = rpcrdma_areadch; + rtype = rpcrdma_areadch; else - req->rl_rtype = rpcrdma_readch; + rtype = rpcrdma_readch; /* The following simplification is not true forever */ - if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) - req->rl_wtype = rpcrdma_noch; - if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { + if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) + wtype = rpcrdma_noch; + if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { dprintk("RPC: %s: cannot marshal multiple chunk lists\n", __func__); return -EIO; @@ -479,7 +466,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * When padding is in use and applies to the transfer, insert * it and change the message type. */ - if (req->rl_rtype == rpcrdma_noch) { + if (rtype == rpcrdma_noch) { padlen = rpcrdma_inline_pullup(rqst, RPCRDMA_INLINE_PAD_VALUE(rqst)); @@ -494,7 +481,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (req->rl_wtype != rpcrdma_noch) { + if (wtype != rpcrdma_noch) { dprintk("RPC: %s: invalid chunk list\n", __func__); return -EIO; @@ -515,18 +502,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * on receive. Therefore, we request a reply chunk * for non-writes wherever feasible and efficient. */ - if (req->rl_wtype == rpcrdma_noch) - req->rl_wtype = rpcrdma_replych; + if (wtype == rpcrdma_noch) + wtype = rpcrdma_replych; } } - hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); + if (rtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, + headerp, rtype); + wtype = rtype; /* simplify dprintk */ + + } else if (wtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, + headerp, wtype); + } if (hdrlen < 0) return hdrlen; dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[wtype], hdrlen, rpclen, padlen, headerp, base, rdmab_lkey(req->rl_rdmabuf)); /* diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 9be7f97205ba..97f656292feb 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -608,10 +608,7 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; - if (req->rl_niovs == 0) - rc = rpcrdma_marshal_req(rqst); - else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL) - rc = rpcrdma_marshal_chunks(rqst, 0); + rc = rpcrdma_marshal_req(rqst); if (rc < 0) goto failed_marshal; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0a16fb6f0885..c8afd83e8b75 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -143,14 +143,6 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } -enum rpcrdma_chunktype { - rpcrdma_noch = 0, - rpcrdma_readch, - rpcrdma_areadch, - rpcrdma_writech, - rpcrdma_replych -}; - /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -258,7 +250,6 @@ struct rpcrdma_req { unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ unsigned int rl_connect_cookie; /* retry detection */ - enum rpcrdma_chunktype rl_rtype, rl_wtype; struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct ib_sge rl_send_iov[4]; /* for active requests */ @@ -418,7 +409,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ -ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t); int rpcrdma_marshal_req(struct rpc_rqst *); size_t rpcrdma_max_payload(struct rpcrdma_xprt *); -- cgit v1.2.3 From 805272406a980cab0e11742e5423ba97b6f38836 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:34:02 -0400 Subject: xprtrdma: Byte-align FRWR registration The RPC/RDMA transport's FRWR registration logic registers whole pages. This means areas in the first and last pages that are not involved in the RDMA I/O are needlessly exposed to the server. Buffered I/O is typically page-aligned, so not a problem there. But for direct I/O, which can be byte-aligned, and for reply chunks, which are nearly always smaller than a page, the transport could expose memory outside the I/O buffer. FRWR allows byte-aligned memory registration, so let's use it as it was intended. Reported-by: Sagi Grimberg Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1aa55b74b1b7..60f3317c90ee 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1924,23 +1924,19 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - dprintk("RPC: %s: Using frmr %p to map %d segments\n", - __func__, mw, i); + dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n", + __func__, mw, i, len); frmr->fr_state = FRMR_IS_VALID; memset(&fastreg_wr, 0, sizeof(fastreg_wr)); fastreg_wr.wr_id = (unsigned long)(void *)mw; fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma; + fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff; fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; fastreg_wr.wr.fast_reg.page_list_len = page_no; fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; - if (fastreg_wr.wr.fast_reg.length < len) { - rc = -EIO; - goto out_err; - } + fastreg_wr.wr.fast_reg.length = len; /* Bump the key */ key = (u8)(mr->rkey & 0x000000FF); -- cgit v1.2.3 From 41f97028969e4c88efa5fcf58bc6125210413a6d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:34:12 -0400 Subject: xprtrdma: Prevent infinite loop in rpcrdma_ep_create() If a provider advertizes a zero max_fast_reg_page_list_len, FRWR depth detection loops forever. Instead of just failing the mount, try other memory registration modes. Fixes: 0fc6c4e7bb28 ("xprtrdma: mind the device's max fast . . .") Reported-by: Devesh Sharma Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 60f3317c90ee..99752b5b7354 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -618,9 +618,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (memreg == RPCRDMA_FRMR) { /* Requires both frmr reg and local dma lkey */ - if ((devattr->device_cap_flags & + if (((devattr->device_cap_flags & (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != - (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) || + (devattr->max_fast_reg_page_list_len == 0)) { dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; -- cgit v1.2.3 From a0ce85f595c22d28bf03c3fae8545b3077b7be1b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:34:21 -0400 Subject: xprtrdma: Add vector of ops for each memory registration strategy Instead of employing switch() statements, let's use the typical Linux kernel idiom for handling behavioral variation: virtual functions. Start by defining a vector of operations for each supported memory registration mode, and by adding a source file for each mode. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/Makefile | 3 ++- net/sunrpc/xprtrdma/fmr_ops.c | 22 ++++++++++++++++++++++ net/sunrpc/xprtrdma/frwr_ops.c | 22 ++++++++++++++++++++++ net/sunrpc/xprtrdma/physical_ops.c | 24 ++++++++++++++++++++++++ net/sunrpc/xprtrdma/verbs.c | 11 +++++++---- net/sunrpc/xprtrdma/xprt_rdma.h | 12 ++++++++++++ 6 files changed, 89 insertions(+), 5 deletions(-) create mode 100644 net/sunrpc/xprtrdma/fmr_ops.c create mode 100644 net/sunrpc/xprtrdma/frwr_ops.c create mode 100644 net/sunrpc/xprtrdma/physical_ops.c diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index da5136fd5694..579f72bbcf4b 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o -xprtrdma-y := transport.o rpc_rdma.o verbs.o +xprtrdma-y := transport.o rpc_rdma.o verbs.o \ + fmr_ops.o frwr_ops.o physical_ops.o obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c new file mode 100644 index 000000000000..ffb7d9358480 --- /dev/null +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Memory Regions (FMR). + * Referred to sometimes as MTHCAFMR mode. + * + * FMR uses synchronous memory registration and deregistration. + * FMR registration is known to be fast, but FMR deregistration + * can take tens of usecs to complete. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { + .ro_displayname = "fmr", +}; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c new file mode 100644 index 000000000000..79173f98e09a --- /dev/null +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Registration Work + * Requests (FRWR). Also referred to sometimes as FRMR mode. + * + * FRWR features ordered asynchronous registration and deregistration + * of arbitrarily sized memory regions. This is the fastest and safest + * but most complex memory registration mode. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { + .ro_displayname = "frwr", +}; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c new file mode 100644 index 000000000000..b0922accabcf --- /dev/null +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* No-op chunk preparation. All client memory is pre-registered. + * Sometimes referred to as ALLPHYSICAL mode. + * + * Physical registration is simple because all client memory is + * pre-registered and never deregistered. This mode is good for + * adapter bring up, but is considered not safe: the server is + * trusted not to abuse its access to client memory not involved + * in RDMA I/O. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { + .ro_displayname = "physical", +}; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 99752b5b7354..c3319e12551c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -492,10 +492,10 @@ connected: int ird = attr->max_dest_rd_atomic; int tird = ep->rep_remote_cma.responder_resources; - pr_info("rpcrdma: connection to %pIS:%u on %s, memreg %d slots %d ird %d%s\n", + pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n", sap, rpc_get_port(sap), ia->ri_id->device->name, - ia->ri_memreg_strategy, + ia->ri_ops->ro_displayname, xprt->rx_buf.rb_max_requests, ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); } else if (connstate < 0) { @@ -650,13 +650,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) */ switch (memreg) { case RPCRDMA_FRMR: + ia->ri_ops = &rpcrdma_frwr_memreg_ops; break; case RPCRDMA_ALLPHYSICAL: + ia->ri_ops = &rpcrdma_physical_memreg_ops; mem_priv = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; goto register_setup; case RPCRDMA_MTHCAFMR: + ia->ri_ops = &rpcrdma_fmr_memreg_ops; if (ia->ri_have_dma_lkey) break; mem_priv = IB_ACCESS_LOCAL_WRITE; @@ -676,8 +679,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) rc = -ENOMEM; goto out3; } - dprintk("RPC: %s: memory registration strategy is %d\n", - __func__, memreg); + dprintk("RPC: %s: memory registration strategy is '%s'\n", + __func__, ia->ri_ops->ro_displayname); /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c8afd83e8b75..ef3cf4aeecd6 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -60,6 +60,7 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { + const struct rpcrdma_memreg_ops *ri_ops; rwlock_t ri_qplock; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; @@ -330,6 +331,17 @@ struct rpcrdma_stats { unsigned long bad_reply_count; }; +/* + * Per-registration mode operations + */ +struct rpcrdma_memreg_ops { + const char *ro_displayname; +}; + +extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops; + /* * RPCRDMA transport -- encapsulates the structures above for * integration with RPC. -- cgit v1.2.3 From 1c9351ee0e346ec1b3c700a4bc8f881923e1808e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:34:30 -0400 Subject: xprtrdma: Add a "max_payload" op for each memreg mode The max_payload computation is generalized to ensure that the payload maximum is the lesser of RPC_MAX_DATA_SEGS and the number of data segments that can be transmitted in an inline buffer. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 13 ++++++++++ net/sunrpc/xprtrdma/frwr_ops.c | 13 ++++++++++ net/sunrpc/xprtrdma/physical_ops.c | 10 ++++++++ net/sunrpc/xprtrdma/transport.c | 5 +++- net/sunrpc/xprtrdma/verbs.c | 49 ++++++++++++-------------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 5 +++- 6 files changed, 59 insertions(+), 36 deletions(-) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index ffb7d9358480..eec266055b28 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -17,6 +17,19 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +/* Maximum scatter/gather per FMR */ +#define RPCRDMA_MAX_FMR_SGES (64) + +/* FMR mode conveys up to 64 pages of payload per chunk segment. + */ +static size_t +fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES); +} + const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { + .ro_maxpages = fmr_op_maxpages, .ro_displayname = "fmr", }; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 79173f98e09a..73a5ac898efc 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -17,6 +17,19 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +/* FRWR mode conveys a list of pages per chunk segment. The + * maximum length of that list is the FRWR page list depth. + */ +static size_t +frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); +} + const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { + .ro_maxpages = frwr_op_maxpages, .ro_displayname = "frwr", }; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index b0922accabcf..28ade1943a57 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -19,6 +19,16 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +/* PHYSICAL memory registration conveys one page per chunk segment. + */ +static size_t +physical_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt)); +} + const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { + .ro_maxpages = physical_op_maxpages, .ro_displayname = "physical", }; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 97f656292feb..da71a24641e3 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -406,7 +406,10 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_connect_worker); xprt_rdma_format_addresses(xprt); - xprt->max_payload = rpcrdma_max_payload(new_xprt); + xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); + if (xprt->max_payload == 0) + goto out4; + xprt->max_payload <<= PAGE_SHIFT; dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", __func__, xprt->max_payload); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c3319e12551c..da55cda30568 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -2212,43 +2212,24 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, return rc; } -/* Physical mapping means one Read/Write list entry per-page. - * All list entries must fit within an inline buffer - * - * NB: The server must return a Write list for NFS READ, - * which has the same constraint. Factor in the inline - * rsize as well. +/* How many chunk list items fit within our inline buffers? */ -static size_t -rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt) +unsigned int +rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - unsigned int inline_size, pages; - - inline_size = min_t(unsigned int, - cdata->inline_wsize, cdata->inline_rsize); - inline_size -= RPCRDMA_HDRLEN_MIN; - pages = inline_size / sizeof(struct rpcrdma_segment); - return pages << PAGE_SHIFT; -} + int bytes, segments; -static size_t -rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt) -{ - return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; -} - -size_t -rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt) -{ - size_t result; - - switch (r_xprt->rx_ia.ri_memreg_strategy) { - case RPCRDMA_ALLPHYSICAL: - result = rpcrdma_physical_max_payload(r_xprt); - break; - default: - result = rpcrdma_mr_max_payload(r_xprt); + bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize); + bytes -= RPCRDMA_HDRLEN_MIN; + if (bytes < sizeof(struct rpcrdma_segment) * 2) { + pr_warn("RPC: %s: inline threshold too small\n", + __func__); + return 0; } - return result; + + segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1); + dprintk("RPC: %s: max chunk list size = %d segments\n", + __func__, segments); + return segments; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index ef3cf4aeecd6..59e627e96a0b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -334,7 +334,9 @@ struct rpcrdma_stats { /* * Per-registration mode operations */ +struct rpcrdma_xprt; struct rpcrdma_memreg_ops { + size_t (*ro_maxpages)(struct rpcrdma_xprt *); const char *ro_displayname; }; @@ -411,6 +413,8 @@ struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, void rpcrdma_free_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); +unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *); + /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c */ @@ -422,7 +426,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ int rpcrdma_marshal_req(struct rpc_rqst *); -size_t rpcrdma_max_payload(struct rpcrdma_xprt *); /* Temporary NFS request map cache. Created in svc_rdma.c */ extern struct kmem_cache *svc_rdma_map_cachep; -- cgit v1.2.3 From 9c1b4d775f2d7dd5bb806e3de2f3e1244a7cbd16 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:34:39 -0400 Subject: xprtrdma: Add a "register_external" op for each memreg mode There is very little common processing among the different external memory registration functions. Have rpcrdma_create_chunks() call the registration method directly. This removes a stack frame and a switch statement from the external registration path. Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 51 +++++++++++ net/sunrpc/xprtrdma/frwr_ops.c | 82 ++++++++++++++++++ net/sunrpc/xprtrdma/physical_ops.c | 17 ++++ net/sunrpc/xprtrdma/rpc_rdma.c | 5 +- net/sunrpc/xprtrdma/verbs.c | 168 +------------------------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 6 +- 6 files changed, 160 insertions(+), 169 deletions(-) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index eec266055b28..45fb646a6d36 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -29,7 +29,58 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES); } +/* Use the ib_map_phys_fmr() verb to register a memory region + * for remote access via RDMA READ or RDMA WRITE. + */ +static int +fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw = seg1->rl_mw; + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; + int len, pageoff, i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > RPCRDMA_MAX_FMR_SGES) + nsegs = RPCRDMA_MAX_FMR_SGES; + for (i = 0; i < nsegs;) { + rpcrdma_map_one(ia, seg, writing); + physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + + rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma); + if (rc) + goto out_maperr; + + seg1->mr_rkey = mw->r.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + return i; + +out_maperr: + dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", + __func__, len, (unsigned long long)seg1->mr_dma, + pageoff, i, rc); + while (i--) + rpcrdma_unmap_one(ia, --seg); + return rc; +} + const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { + .ro_map = fmr_op_map, .ro_maxpages = fmr_op_maxpages, .ro_displayname = "fmr", }; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 73a5ac898efc..23e4d99a2097 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -29,7 +29,89 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); } +/* Post a FAST_REG Work Request to register a memory region + * for remote access via RDMA READ or RDMA WRITE. + */ +static int +frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw = seg1->rl_mw; + struct rpcrdma_frmr *frmr = &mw->r.frmr; + struct ib_mr *mr = frmr->fr_mr; + struct ib_send_wr fastreg_wr, *bad_wr; + u8 key; + int len, pageoff; + int i, rc; + int seg_len; + u64 pa; + int page_no; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > ia->ri_max_frmr_depth) + nsegs = ia->ri_max_frmr_depth; + for (page_no = i = 0; i < nsegs;) { + rpcrdma_map_one(ia, seg, writing); + pa = seg->mr_dma; + for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { + frmr->fr_pgl->page_list[page_no++] = pa; + pa += PAGE_SIZE; + } + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n", + __func__, mw, i, len); + + frmr->fr_state = FRMR_IS_VALID; + + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = (unsigned long)(void *)mw; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff; + fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.page_list_len = page_no; + fastreg_wr.wr.fast_reg.length = len; + fastreg_wr.wr.fast_reg.access_flags = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); + fastreg_wr.wr.fast_reg.rkey = mr->rkey; + + DECR_CQCOUNT(&r_xprt->rx_ep); + rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); + if (rc) + goto out_senderr; + + seg1->mr_rkey = mr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + return i; + +out_senderr: + dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); + ib_update_fast_reg_key(mr, --key); + frmr->fr_state = FRMR_IS_INVALID; + while (i--) + rpcrdma_unmap_one(ia, --seg); + return rc; +} + const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { + .ro_map = frwr_op_map, .ro_maxpages = frwr_op_maxpages, .ro_displayname = "frwr", }; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 28ade1943a57..5a284ee0e058 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -28,7 +28,24 @@ physical_op_maxpages(struct rpcrdma_xprt *r_xprt) rpcrdma_max_segments(r_xprt)); } +/* The client's physical memory is already exposed for + * remote access via RDMA READ or RDMA WRITE. + */ +static int +physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + rpcrdma_map_one(ia, seg, writing); + seg->mr_rkey = ia->ri_bind_mem->rkey; + seg->mr_base = seg->mr_dma; + seg->mr_nsegs = 1; + return 1; +} + const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { + .ro_map = physical_op_map, .ro_maxpages = physical_op_maxpages, .ro_displayname = "physical", }; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 41456d9e5a7d..6ab1d03d7f3e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -187,6 +187,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, struct rpcrdma_write_array *warray = NULL; struct rpcrdma_write_chunk *cur_wchunk = NULL; __be32 *iptr = headerp->rm_body.rm_chunks; + int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); if (type == rpcrdma_readch || type == rpcrdma_areadch) { /* a read chunk - server will RDMA Read our memory */ @@ -209,9 +210,9 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, if (nsegs < 0) return nsegs; + map = r_xprt->rx_ia.ri_ops->ro_map; do { - n = rpcrdma_register_external(seg, nsegs, - cur_wchunk != NULL, r_xprt); + n = map(r_xprt, seg, nsegs, cur_wchunk != NULL); if (n <= 0) goto out; if (cur_rchunk) { /* read */ diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index da55cda30568..4318c04e095c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1858,8 +1858,8 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) * Wrappers for chunk registration, shared by read/write chunk code. */ -static void -rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) +void +rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, bool writing) { seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; seg->mr_dmalen = seg->mr_len; @@ -1879,7 +1879,7 @@ rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) } } -static void +void rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) { if (seg->mr_page) @@ -1890,89 +1890,6 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) seg->mr_dma, seg->mr_dmalen, seg->mr_dir); } -static int -rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia, - struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct rpcrdma_mw *mw = seg1->rl_mw; - struct rpcrdma_frmr *frmr = &mw->r.frmr; - struct ib_mr *mr = frmr->fr_mr; - struct ib_send_wr fastreg_wr, *bad_wr; - u8 key; - int len, pageoff; - int i, rc; - int seg_len; - u64 pa; - int page_no; - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (*nsegs > ia->ri_max_frmr_depth) - *nsegs = ia->ri_max_frmr_depth; - for (page_no = i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - pa = seg->mr_dma; - for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { - frmr->fr_pgl->page_list[page_no++] = pa; - pa += PAGE_SIZE; - } - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n", - __func__, mw, i, len); - - frmr->fr_state = FRMR_IS_VALID; - - memset(&fastreg_wr, 0, sizeof(fastreg_wr)); - fastreg_wr.wr_id = (unsigned long)(void *)mw; - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff; - fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; - fastreg_wr.wr.fast_reg.page_list_len = page_no; - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fastreg_wr.wr.fast_reg.length = len; - - /* Bump the key */ - key = (u8)(mr->rkey & 0x000000FF); - ib_update_fast_reg_key(mr, ++key); - - fastreg_wr.wr.fast_reg.access_flags = (writing ? - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ); - fastreg_wr.wr.fast_reg.rkey = mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); - if (rc) { - dprintk("RPC: %s: failed ib_post_send for register," - " status %i\n", __func__, rc); - ib_update_fast_reg_key(mr, --key); - goto out_err; - } else { - seg1->mr_rkey = mr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return 0; -out_err: - frmr->fr_state = FRMR_IS_INVALID; - while (i--) - rpcrdma_unmap_one(ia, --seg); - return rc; -} - static int rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) @@ -2003,49 +1920,6 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, return rc; } -static int -rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; - int len, pageoff, i, rc; - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (*nsegs > RPCRDMA_MAX_DATA_SEGS) - *nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - physaddrs[i] = seg->mr_dma; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma); - if (rc) { - dprintk("RPC: %s: failed ib_map_phys_fmr " - "%u@0x%llx+%i (%d)... status %i\n", __func__, - len, (unsigned long long)seg1->mr_dma, - pageoff, i, rc); - while (i--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return rc; -} - static int rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_ia *ia) @@ -2066,42 +1940,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, return rc; } -int -rpcrdma_register_external(struct rpcrdma_mr_seg *seg, - int nsegs, int writing, struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc = 0; - - switch (ia->ri_memreg_strategy) { - - case RPCRDMA_ALLPHYSICAL: - rpcrdma_map_one(ia, seg, writing); - seg->mr_rkey = ia->ri_bind_mem->rkey; - seg->mr_base = seg->mr_dma; - seg->mr_nsegs = 1; - nsegs = 1; - break; - - /* Registration using frmr registration */ - case RPCRDMA_FRMR: - rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); - break; - - /* Registration using fmr memory registration */ - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); - break; - - default: - return -EIO; - } - if (rc) - return rc; - - return nsegs; -} - int rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_xprt *r_xprt) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 59e627e96a0b..7bf077bf751d 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -336,6 +336,8 @@ struct rpcrdma_stats { */ struct rpcrdma_xprt; struct rpcrdma_memreg_ops { + int (*ro_map)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *, int, bool); size_t (*ro_maxpages)(struct rpcrdma_xprt *); const char *ro_displayname; }; @@ -403,8 +405,6 @@ void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); -int rpcrdma_register_external(struct rpcrdma_mr_seg *, - int, int, struct rpcrdma_xprt *); int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, struct rpcrdma_xprt *); @@ -414,6 +414,8 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *); +void rpcrdma_map_one(struct rpcrdma_ia *, struct rpcrdma_mr_seg *, bool); +void rpcrdma_unmap_one(struct rpcrdma_ia *, struct rpcrdma_mr_seg *); /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c -- cgit v1.2.3 From 6814baead86b5d44096ddfbb6f944163578e68c3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:34:48 -0400 Subject: xprtrdma: Add a "deregister_external" op for each memreg mode There is very little common processing among the different external memory deregistration functions. Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 27 +++++++++++++ net/sunrpc/xprtrdma/frwr_ops.c | 36 +++++++++++++++++ net/sunrpc/xprtrdma/physical_ops.c | 10 +++++ net/sunrpc/xprtrdma/rpc_rdma.c | 11 +++--- net/sunrpc/xprtrdma/transport.c | 4 +- net/sunrpc/xprtrdma/verbs.c | 81 +------------------------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 5 +-- 7 files changed, 84 insertions(+), 90 deletions(-) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 45fb646a6d36..888aa107cf0b 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -79,8 +79,35 @@ out_maperr: return rc; } +/* Use the ib_unmap_fmr() verb to prevent further remote + * access via RDMA READ or RDMA WRITE. + */ +static int +fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg1 = seg; + int rc, nsegs = seg->mr_nsegs; + LIST_HEAD(l); + + list_add(&seg1->rl_mw->r.fmr->list, &l); + rc = ib_unmap_fmr(&l); + read_lock(&ia->ri_qplock); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + read_unlock(&ia->ri_qplock); + if (rc) + goto out_err; + return nsegs; + +out_err: + dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc); + return nsegs; +} + const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, + .ro_unmap = fmr_op_unmap, .ro_maxpages = fmr_op_maxpages, .ro_displayname = "fmr", }; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 23e4d99a2097..35b725bf0afb 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -110,8 +110,44 @@ out_senderr: return rc; } +/* Post a LOCAL_INV Work Request to prevent further remote access + * via RDMA READ or RDMA WRITE. + */ +static int +frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_send_wr invalidate_wr, *bad_wr; + int rc, nsegs = seg->mr_nsegs; + + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; + + memset(&invalidate_wr, 0, sizeof(invalidate_wr)); + invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + read_lock(&ia->ri_qplock); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + read_unlock(&ia->ri_qplock); + if (rc) + goto out_err; + return nsegs; + +out_err: + /* Force rpcrdma_buffer_get() to retry */ + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; + dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); + return nsegs; +} + const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, + .ro_unmap = frwr_op_unmap, .ro_maxpages = frwr_op_maxpages, .ro_displayname = "frwr", }; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 5a284ee0e058..5b5a63adfdf3 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -44,8 +44,18 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, return 1; } +/* Unmap a memory region, but leave it registered. + */ +static int +physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + rpcrdma_unmap_one(&r_xprt->rx_ia, seg); + return 1; +} + const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_map = physical_op_map, + .ro_unmap = physical_op_unmap, .ro_maxpages = physical_op_maxpages, .ro_displayname = "physical", }; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 6ab1d03d7f3e..2c53ea9e1b83 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -284,11 +284,12 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, return (unsigned char *)iptr - (unsigned char *)headerp; out: - if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) { - for (pos = 0; nchunks--;) - pos += rpcrdma_deregister_external( - &req->rl_segments[pos], r_xprt); - } + if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) + return n; + + for (pos = 0; nchunks--;) + pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, + &req->rl_segments[pos]); return n; } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index da71a24641e3..54f23b1be986 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -584,8 +584,8 @@ xprt_rdma_free(void *buffer) for (i = 0; req->rl_nchunks;) { --req->rl_nchunks; - i += rpcrdma_deregister_external( - &req->rl_segments[i], r_xprt); + i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, + &req->rl_segments[i]); } rpcrdma_buffer_put(req); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 4318c04e095c..b167c99fbfb6 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1510,7 +1510,7 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) } } -/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). +/* rpcrdma_unmap_one() was already done during deregistration. * Redo only the ib_post_send(). */ static void @@ -1890,85 +1890,6 @@ rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) seg->mr_dma, seg->mr_dmalen, seg->mr_dir); } -static int -rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct ib_send_wr invalidate_wr, *bad_wr; - int rc; - - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; - - memset(&invalidate_wr, 0, sizeof invalidate_wr); - invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - read_lock(&ia->ri_qplock); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); - read_unlock(&ia->ri_qplock); - if (rc) { - /* Force rpcrdma_buffer_get() to retry */ - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; - dprintk("RPC: %s: failed ib_post_send for invalidate," - " status %i\n", __func__, rc); - } - return rc; -} - -static int -rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - LIST_HEAD(l); - int rc; - - list_add(&seg1->rl_mw->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - read_lock(&ia->ri_qplock); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - read_unlock(&ia->ri_qplock); - if (rc) - dprintk("RPC: %s: failed ib_unmap_fmr," - " status %i\n", __func__, rc); - return rc; -} - -int -rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int nsegs = seg->mr_nsegs, rc; - - switch (ia->ri_memreg_strategy) { - - case RPCRDMA_ALLPHYSICAL: - read_lock(&ia->ri_qplock); - rpcrdma_unmap_one(ia, seg); - read_unlock(&ia->ri_qplock); - break; - - case RPCRDMA_FRMR: - rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); - break; - - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_deregister_fmr_external(seg, ia); - break; - - default: - break; - } - return nsegs; -} - /* * Prepost any receive buffer, then post send. * diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 7bf077bf751d..9a727f960df9 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -338,6 +338,8 @@ struct rpcrdma_xprt; struct rpcrdma_memreg_ops { int (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); + int (*ro_unmap)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *); size_t (*ro_maxpages)(struct rpcrdma_xprt *); const char *ro_displayname; }; @@ -405,9 +407,6 @@ void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); -int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, - struct rpcrdma_xprt *); - struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, size_t, gfp_t); void rpcrdma_free_regbuf(struct rpcrdma_ia *, -- cgit v1.2.3 From 91e70e70e47b3355bb0a8b3b196c93897dcdb440 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:34:58 -0400 Subject: xprtrdma: Add "init MRs" memreg op This method is used when setting up a new transport instance to create a pool of Memory Region objects that will be used to register memory during operation. Memory Regions are not needed for "physical" registration, since ->prepare and ->release are no-ops for that mode. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 42 +++++++++++++++ net/sunrpc/xprtrdma/frwr_ops.c | 66 +++++++++++++++++++++++ net/sunrpc/xprtrdma/physical_ops.c | 7 +++ net/sunrpc/xprtrdma/verbs.c | 104 ++----------------------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 5 files changed, 119 insertions(+), 101 deletions(-) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 888aa107cf0b..825ce9641350 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -29,6 +29,47 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES); } +static int +fmr_op_init(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; + struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + struct rpcrdma_mw *r; + int i, rc; + + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return -ENOMEM; + + r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); + if (IS_ERR(r->r.fmr)) + goto out_fmr_err; + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + return 0; + +out_fmr_err: + rc = PTR_ERR(r->r.fmr); + dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); + kfree(r); + return rc; +} + /* Use the ib_map_phys_fmr() verb to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ @@ -109,5 +150,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_unmap = fmr_op_unmap, .ro_maxpages = fmr_op_maxpages, + .ro_init = fmr_op_init, .ro_displayname = "fmr", }; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 35b725bf0afb..9168c15faafa 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -17,6 +17,35 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +static int +__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, + unsigned int depth) +{ + struct rpcrdma_frmr *f = &r->r.frmr; + int rc; + + f->fr_mr = ib_alloc_fast_reg_mr(pd, depth); + if (IS_ERR(f->fr_mr)) + goto out_mr_err; + f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth); + if (IS_ERR(f->fr_pgl)) + goto out_list_err; + return 0; + +out_mr_err: + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n", + __func__, rc); + return rc; + +out_list_err: + rc = PTR_ERR(f->fr_pgl); + dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n", + __func__, rc); + ib_dereg_mr(f->fr_mr); + return rc; +} + /* FRWR mode conveys a list of pages per chunk segment. The * maximum length of that list is the FRWR page list depth. */ @@ -29,6 +58,42 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); } +static int +frwr_op_init(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct ib_device *device = r_xprt->rx_ia.ri_id->device; + unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + int i; + + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); + + while (i--) { + struct rpcrdma_mw *r; + int rc; + + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return -ENOMEM; + + rc = __frwr_init(r, pd, device, depth); + if (rc) { + kfree(r); + return rc; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + + return 0; +} + /* Post a FAST_REG Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ @@ -149,5 +214,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, .ro_unmap = frwr_op_unmap, .ro_maxpages = frwr_op_maxpages, + .ro_init = frwr_op_init, .ro_displayname = "frwr", }; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 5b5a63adfdf3..c37205190a2f 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -28,6 +28,12 @@ physical_op_maxpages(struct rpcrdma_xprt *r_xprt) rpcrdma_max_segments(r_xprt)); } +static int +physical_op_init(struct rpcrdma_xprt *r_xprt) +{ + return 0; +} + /* The client's physical memory is already exposed for * remote access via RDMA READ or RDMA WRITE. */ @@ -57,5 +63,6 @@ const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_map = physical_op_map, .ro_unmap = physical_op_unmap, .ro_maxpages = physical_op_maxpages, + .ro_init = physical_op_init, .ro_displayname = "physical", }; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b167c99fbfb6..e89a57d4e4f2 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1124,91 +1124,6 @@ out: return ERR_PTR(rc); } -static int -rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) -{ - int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; - struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_DATA_SEGS, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - struct rpcrdma_mw *r; - int i, rc; - - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); - - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (r == NULL) - return -ENOMEM; - - r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr); - if (IS_ERR(r->r.fmr)) { - rc = PTR_ERR(r->r.fmr); - dprintk("RPC: %s: ib_alloc_fmr failed %i\n", - __func__, rc); - goto out_free; - } - - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - return 0; - -out_free: - kfree(r); - return rc; -} - -static int -rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) -{ - struct rpcrdma_frmr *f; - struct rpcrdma_mw *r; - int i, rc; - - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); - - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (r == NULL) - return -ENOMEM; - f = &r->r.frmr; - - f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_mr)) { - rc = PTR_ERR(f->fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr " - "failed %i\n", __func__, rc); - goto out_free; - } - - f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_pgl)) { - rc = PTR_ERR(f->fr_pgl); - dprintk("RPC: %s: ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(f->fr_mr); - goto out_free; - } - - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - - return 0; - -out_free: - kfree(r); - return rc; -} - int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { @@ -1245,22 +1160,9 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_recv_bufs = (struct rpcrdma_rep **) p; p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; - INIT_LIST_HEAD(&buf->rb_mws); - INIT_LIST_HEAD(&buf->rb_all); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rc = rpcrdma_init_frmrs(ia, buf); - if (rc) - goto out; - break; - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_init_fmrs(ia, buf); - if (rc) - goto out; - break; - default: - break; - } + rc = ia->ri_ops->ro_init(r_xprt); + if (rc) + goto out; for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9a727f960df9..90b60feab45a 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -341,6 +341,7 @@ struct rpcrdma_memreg_ops { int (*ro_unmap)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *); size_t (*ro_maxpages)(struct rpcrdma_xprt *); + int (*ro_init)(struct rpcrdma_xprt *); const char *ro_displayname; }; -- cgit v1.2.3 From 31a701a94751509bb72e13d851f18ddcf22ff722 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:35:07 -0400 Subject: xprtrdma: Add "reset MRs" memreg op This method is invoked when a transport instance is about to be reconnected. Each Memory Region object is reset to its initial state. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 23 +++++++++ net/sunrpc/xprtrdma/frwr_ops.c | 51 ++++++++++++++++++ net/sunrpc/xprtrdma/physical_ops.c | 6 +++ net/sunrpc/xprtrdma/verbs.c | 103 +------------------------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 5 files changed, 83 insertions(+), 101 deletions(-) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 825ce9641350..93261b05891e 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -146,10 +146,33 @@ out_err: return nsegs; } +/* After a disconnect, unmap all FMRs. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_fmr_external(). + */ +static void +fmr_op_reset(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_mw *r; + LIST_HEAD(list); + int rc; + + list_for_each_entry(r, &buf->rb_all, mw_all) + list_add(&r->r.fmr->list, &list); + + rc = ib_unmap_fmr(&list); + if (rc) + dprintk("RPC: %s: ib_unmap_fmr failed %i\n", + __func__, rc); +} + const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_unmap = fmr_op_unmap, .ro_maxpages = fmr_op_maxpages, .ro_init = fmr_op_init, + .ro_reset = fmr_op_reset, .ro_displayname = "fmr", }; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 9168c15faafa..c2bb29d4df3c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -46,6 +46,18 @@ out_list_err: return rc; } +static void +__frwr_release(struct rpcrdma_mw *r) +{ + int rc; + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr status %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); +} + /* FRWR mode conveys a list of pages per chunk segment. The * maximum length of that list is the FRWR page list depth. */ @@ -210,10 +222,49 @@ out_err: return nsegs; } +/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in + * an unusable state. Find FRMRs in this state and dereg / reg + * each. FRMRs that are VALID and attached to an rpcrdma_req are + * also torn down. + * + * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_frmr_external(). + */ +static void +frwr_op_reset(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct ib_device *device = r_xprt->rx_ia.ri_id->device; + unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + struct rpcrdma_mw *r; + int rc; + + list_for_each_entry(r, &buf->rb_all, mw_all) { + if (r->r.frmr.fr_state == FRMR_IS_INVALID) + continue; + + __frwr_release(r); + rc = __frwr_init(r, pd, device, depth); + if (rc) { + dprintk("RPC: %s: mw %p left %s\n", + __func__, r, + (r->r.frmr.fr_state == FRMR_IS_STALE ? + "stale" : "valid")); + continue; + } + + r->r.frmr.fr_state = FRMR_IS_INVALID; + } +} + const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, .ro_unmap = frwr_op_unmap, .ro_maxpages = frwr_op_maxpages, .ro_init = frwr_op_init, + .ro_reset = frwr_op_reset, .ro_displayname = "frwr", }; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index c37205190a2f..e0607136655b 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -59,10 +59,16 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) return 1; } +static void +physical_op_reset(struct rpcrdma_xprt *r_xprt) +{ +} + const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_map = physical_op_map, .ro_unmap = physical_op_unmap, .ro_maxpages = physical_op_maxpages, .ro_init = physical_op_init, + .ro_reset = physical_op_reset, .ro_displayname = "physical", }; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e89a57d4e4f2..1b2c1f4ec4c3 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -63,9 +63,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -static void rpcrdma_reset_frmrs(struct rpcrdma_ia *); -static void rpcrdma_reset_fmrs(struct rpcrdma_ia *); - /* * internal functions */ @@ -945,21 +942,9 @@ retry: rpcrdma_ep_disconnect(ep, ia); rpcrdma_flush_cqs(ep); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rpcrdma_reset_frmrs(ia); - break; - case RPCRDMA_MTHCAFMR: - rpcrdma_reset_fmrs(ia); - break; - case RPCRDMA_ALLPHYSICAL: - break; - default: - rc = -EIO; - goto out; - } - xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + ia->ri_ops->ro_reset(xprt); + id = rpcrdma_create_id(xprt, ia, (struct sockaddr *)&xprt->rx_data.addr); if (IS_ERR(id)) { @@ -1289,90 +1274,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) kfree(buf->rb_pool); } -/* After a disconnect, unmap all FMRs. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_fmr_external(). - */ -static void -rpcrdma_reset_fmrs(struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct list_head *pos; - struct rpcrdma_mw *r; - LIST_HEAD(l); - int rc; - - list_for_each(pos, &buf->rb_all) { - r = list_entry(pos, struct rpcrdma_mw, mw_all); - - INIT_LIST_HEAD(&l); - list_add(&r->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - if (rc) - dprintk("RPC: %s: ib_unmap_fmr failed %i\n", - __func__, rc); - } -} - -/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in - * an unusable state. Find FRMRs in this state and dereg / reg - * each. FRMRs that are VALID and attached to an rpcrdma_req are - * also torn down. - * - * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_frmr_external(). - */ -static void -rpcrdma_reset_frmrs(struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct list_head *pos; - struct rpcrdma_mw *r; - int rc; - - list_for_each(pos, &buf->rb_all) { - r = list_entry(pos, struct rpcrdma_mw, mw_all); - - if (r->r.frmr.fr_state == FRMR_IS_INVALID) - continue; - - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - - r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_mr)) { - rc = PTR_ERR(r->r.frmr.fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr" - " failed %i\n", __func__, rc); - continue; - } - r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( - ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_pgl)) { - rc = PTR_ERR(r->r.frmr.fr_pgl); - dprintk("RPC: %s: " - "ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(r->r.frmr.fr_mr); - continue; - } - r->r.frmr.fr_state = FRMR_IS_INVALID; - } -} - /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving * some req segments uninitialized. */ diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 90b60feab45a..06802394cf89 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -342,6 +342,7 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mr_seg *); size_t (*ro_maxpages)(struct rpcrdma_xprt *); int (*ro_init)(struct rpcrdma_xprt *); + void (*ro_reset)(struct rpcrdma_xprt *); const char *ro_displayname; }; -- cgit v1.2.3 From 4561f347d49c645fd81d1f47b0fb460e8a6e4587 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:35:17 -0400 Subject: xprtrdma: Add "destroy MRs" memreg op Memory Region objects associated with a transport instance are destroyed before the instance is shutdown and destroyed. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 18 +++++++++++++ net/sunrpc/xprtrdma/frwr_ops.c | 14 ++++++++++ net/sunrpc/xprtrdma/physical_ops.c | 6 +++++ net/sunrpc/xprtrdma/verbs.c | 52 +------------------------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 5 files changed, 40 insertions(+), 51 deletions(-) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 93261b05891e..e9ca5944ac1e 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -168,11 +168,29 @@ fmr_op_reset(struct rpcrdma_xprt *r_xprt) __func__, rc); } +static void +fmr_op_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", + __func__, rc); + kfree(r); + } +} + const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_unmap = fmr_op_unmap, .ro_maxpages = fmr_op_maxpages, .ro_init = fmr_op_init, .ro_reset = fmr_op_reset, + .ro_destroy = fmr_op_destroy, .ro_displayname = "fmr", }; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index c2bb29d4df3c..121e400d0565 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -260,11 +260,25 @@ frwr_op_reset(struct rpcrdma_xprt *r_xprt) } } +static void +frwr_op_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + __frwr_release(r); + kfree(r); + } +} + const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, .ro_unmap = frwr_op_unmap, .ro_maxpages = frwr_op_maxpages, .ro_init = frwr_op_init, .ro_reset = frwr_op_reset, + .ro_destroy = frwr_op_destroy, .ro_displayname = "frwr", }; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index e0607136655b..eb39011e3129 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -64,11 +64,17 @@ physical_op_reset(struct rpcrdma_xprt *r_xprt) { } +static void +physical_op_destroy(struct rpcrdma_buffer *buf) +{ +} + const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_map = physical_op_map, .ro_unmap = physical_op_unmap, .ro_maxpages = physical_op_maxpages, .ro_init = physical_op_init, .ro_reset = physical_op_reset, + .ro_destroy = physical_op_destroy, .ro_displayname = "physical", }; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1b2c1f4ec4c3..a7fb31441069 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1199,47 +1199,6 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) kfree(req); } -static void -rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - list_del(&r->mw_list); - - rc = ib_dealloc_fmr(r->r.fmr); - if (rc) - dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", - __func__, rc); - - kfree(r); - } -} - -static void -rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - list_del(&r->mw_list); - - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - - kfree(r); - } -} - void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { @@ -1260,16 +1219,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]); } - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rpcrdma_destroy_frmrs(buf); - break; - case RPCRDMA_MTHCAFMR: - rpcrdma_destroy_fmrs(buf); - break; - default: - break; - } + ia->ri_ops->ro_destroy(buf); kfree(buf->rb_pool); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 06802394cf89..b95e223d3d69 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -343,6 +343,7 @@ struct rpcrdma_memreg_ops { size_t (*ro_maxpages)(struct rpcrdma_xprt *); int (*ro_init)(struct rpcrdma_xprt *); void (*ro_reset)(struct rpcrdma_xprt *); + void (*ro_destroy)(struct rpcrdma_buffer *); const char *ro_displayname; }; -- cgit v1.2.3 From 3968cb58501bf526eed1441f4ef237028aa9cd2d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:35:26 -0400 Subject: xprtrdma: Add "open" memreg op The open op determines the size of various transport data structures based on device capabilities and memory registration mode. Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 8 +++++++ net/sunrpc/xprtrdma/frwr_ops.c | 48 +++++++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/physical_ops.c | 8 +++++++ net/sunrpc/xprtrdma/verbs.c | 49 +++----------------------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 3 +++ 5 files changed, 70 insertions(+), 46 deletions(-) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index e9ca5944ac1e..e8a9837f8d63 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -20,6 +20,13 @@ /* Maximum scatter/gather per FMR */ #define RPCRDMA_MAX_FMR_SGES (64) +static int +fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + return 0; +} + /* FMR mode conveys up to 64 pages of payload per chunk segment. */ static size_t @@ -188,6 +195,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_unmap = fmr_op_unmap, + .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, .ro_init = fmr_op_init, .ro_reset = fmr_op_reset, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 121e400d0565..e17d54d473a7 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -58,6 +58,53 @@ __frwr_release(struct rpcrdma_mw *r) ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); } +static int +frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + struct ib_device_attr *devattr = &ia->ri_devattr; + int depth, delta; + + ia->ri_max_frmr_depth = + min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + devattr->max_fast_reg_page_list_len); + dprintk("RPC: %s: device's max FR page list len = %u\n", + __func__, ia->ri_max_frmr_depth); + + /* Add room for frmr register and invalidate WRs. + * 1. FRMR reg WR for head + * 2. FRMR invalidate WR for head + * 3. N FRMR reg WRs for pagelist + * 4. N FRMR invalidate WRs for pagelist + * 5. FRMR reg WR for tail + * 6. FRMR invalidate WR for tail + * 7. The RDMA_SEND WR + */ + depth = 7; + + /* Calculate N if the device max FRMR depth is smaller than + * RPCRDMA_MAX_DATA_SEGS. + */ + if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth; + do { + depth += 2; /* FRMR reg + invalidate */ + delta -= ia->ri_max_frmr_depth; + } while (delta > 0); + } + + ep->rep_attr.cap.max_send_wr *= depth; + if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { + cdata->max_requests = devattr->max_qp_wr / depth; + if (!cdata->max_requests) + return -EINVAL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * + depth; + } + + return 0; +} + /* FRWR mode conveys a list of pages per chunk segment. The * maximum length of that list is the FRWR page list depth. */ @@ -276,6 +323,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, .ro_unmap = frwr_op_unmap, + .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, .ro_init = frwr_op_init, .ro_reset = frwr_op_reset, diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index eb39011e3129..0ba130bed1fc 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -19,6 +19,13 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +static int +physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + return 0; +} + /* PHYSICAL memory registration conveys one page per chunk segment. */ static size_t @@ -72,6 +79,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_map = physical_op_map, .ro_unmap = physical_op_unmap, + .ro_open = physical_op_open, .ro_maxpages = physical_op_maxpages, .ro_init = physical_op_init, .ro_reset = physical_op_reset, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index a7fb31441069..b697b3ed6273 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -622,11 +622,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; - } else { - /* Mind the ia limit on FRMR page list depth */ - ia->ri_max_frmr_depth = min_t(unsigned int, - RPCRDMA_MAX_DATA_SEGS, - devattr->max_fast_reg_page_list_len); } } if (memreg == RPCRDMA_MTHCAFMR) { @@ -741,49 +736,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; ep->rep_attr.qp_context = ep; - /* send_cq and recv_cq initialized below */ ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: { - int depth = 7; - - /* Add room for frmr register and invalidate WRs. - * 1. FRMR reg WR for head - * 2. FRMR invalidate WR for head - * 3. N FRMR reg WRs for pagelist - * 4. N FRMR invalidate WRs for pagelist - * 5. FRMR reg WR for tail - * 6. FRMR invalidate WR for tail - * 7. The RDMA_SEND WR - */ - - /* Calculate N if the device max FRMR depth is smaller than - * RPCRDMA_MAX_DATA_SEGS. - */ - if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { - int delta = RPCRDMA_MAX_DATA_SEGS - - ia->ri_max_frmr_depth; - - do { - depth += 2; /* FRMR reg + invalidate */ - delta -= ia->ri_max_frmr_depth; - } while (delta > 0); - - } - ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { - cdata->max_requests = devattr->max_qp_wr / depth; - if (!cdata->max_requests) - return -EINVAL; - ep->rep_attr.cap.max_send_wr = cdata->max_requests * - depth; - } - break; - } - default: - break; - } + rc = ia->ri_ops->ro_open(ia, ep, cdata); + if (rc) + return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); ep->rep_attr.cap.max_recv_sge = 1; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index b95e223d3d69..9036fb4174d5 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -340,6 +340,9 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mr_seg *, int, bool); int (*ro_unmap)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *); + int (*ro_open)(struct rpcrdma_ia *, + struct rpcrdma_ep *, + struct rpcrdma_create_data_internal *); size_t (*ro_maxpages)(struct rpcrdma_xprt *); int (*ro_init)(struct rpcrdma_xprt *); void (*ro_reset)(struct rpcrdma_xprt *); -- cgit v1.2.3 From e46ac34c3c34e408435656a5fed605c4c787d081 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:35:35 -0400 Subject: xprtrdma: Handle non-SEND completions via a callout Allow each memory registration mode to plug in a callout that handles the completion of a memory registration operation. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 17 +++++++++++++++++ net/sunrpc/xprtrdma/verbs.c | 16 ++++++---------- net/sunrpc/xprtrdma/xprt_rdma.h | 5 +++++ 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index e17d54d473a7..ea59c1b435ff 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -117,6 +117,22 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); } +/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */ +static void +frwr_sendcompletion(struct ib_wc *wc) +{ + struct rpcrdma_mw *r; + + if (likely(wc->status == IB_WC_SUCCESS)) + return; + + /* WARNING: Only wr_id and status are reliable at this point */ + r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + dprintk("RPC: %s: frmr %p (stale), status %d\n", + __func__, r, wc->status); + r->r.frmr.fr_state = FRMR_IS_STALE; +} + static int frwr_op_init(struct rpcrdma_xprt *r_xprt) { @@ -148,6 +164,7 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt) list_add(&r->mw_list, &buf->rb_mws); list_add(&r->mw_all, &buf->rb_all); + r->mw_sendcompletion = frwr_sendcompletion; } return 0; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b697b3ed6273..cac06f290a26 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -186,7 +186,7 @@ static const char * const wc_status[] = { "remote access error", "remote operation error", "transport retry counter exceeded", - "RNR retrycounter exceeded", + "RNR retry counter exceeded", "local RDD violation error", "remove invalid RD request", "operation aborted", @@ -204,21 +204,17 @@ static const char * const wc_status[] = { static void rpcrdma_sendcq_process_wc(struct ib_wc *wc) { - if (likely(wc->status == IB_WC_SUCCESS)) - return; - /* WARNING: Only wr_id and status are reliable at this point */ - if (wc->wr_id == 0ULL) { - if (wc->status != IB_WC_WR_FLUSH_ERR) + if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) { + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) pr_err("RPC: %s: SEND: %s\n", __func__, COMPLETION_MSG(wc->status)); } else { struct rpcrdma_mw *r; r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - r->r.frmr.fr_state = FRMR_IS_STALE; - pr_err("RPC: %s: frmr %p (stale): %s\n", - __func__, r, COMPLETION_MSG(wc->status)); + r->mw_sendcompletion(wc); } } @@ -1622,7 +1618,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, } send_wr.next = NULL; - send_wr.wr_id = 0ULL; /* no send cookie */ + send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; send_wr.sg_list = req->rl_send_iov; send_wr.num_sge = req->rl_niovs; send_wr.opcode = IB_WR_SEND; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9036fb4174d5..54bcbe47075d 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -106,6 +106,10 @@ struct rpcrdma_ep { #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) +/* Force completion handler to ignore the signal + */ +#define RPCRDMA_IGNORE_COMPLETION (0ULL) + /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV * * The below structure appears at the front of a large region of kmalloc'd @@ -206,6 +210,7 @@ struct rpcrdma_mw { struct ib_fmr *fmr; struct rpcrdma_frmr frmr; } r; + void (*mw_sendcompletion)(struct ib_wc *); struct list_head mw_list; struct list_head mw_all; }; -- cgit v1.2.3 From d654788e98f74f2df8dfc6079fa314938f739486 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Mar 2015 14:35:44 -0400 Subject: xprtrdma: Make rpcrdma_{un}map_one() into inline functions These functions are called in a loop for each page transferred via RDMA READ or WRITE. Extract loop invariants and inline them to reduce CPU overhead. Signed-off-by: Chuck Lever Tested-by: Devesh Sharma Tested-by: Meghana Cheripady Tested-by: Veeresh U. Kokatnur Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 10 ++++++--- net/sunrpc/xprtrdma/frwr_ops.c | 10 ++++++--- net/sunrpc/xprtrdma/physical_ops.c | 10 +++++++-- net/sunrpc/xprtrdma/verbs.c | 44 +++++++------------------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 45 ++++++++++++++++++++++++++++++++++++-- 5 files changed, 73 insertions(+), 46 deletions(-) diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index e8a9837f8d63..a91ba2c8ef1e 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -85,6 +85,8 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_device *device = ia->ri_id->device; + enum dma_data_direction direction = rpcrdma_data_dir(writing); struct rpcrdma_mr_seg *seg1 = seg; struct rpcrdma_mw *mw = seg1->rl_mw; u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; @@ -97,7 +99,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (nsegs > RPCRDMA_MAX_FMR_SGES) nsegs = RPCRDMA_MAX_FMR_SGES; for (i = 0; i < nsegs;) { - rpcrdma_map_one(ia, seg, writing); + rpcrdma_map_one(device, seg, direction); physaddrs[i] = seg->mr_dma; len += seg->mr_len; ++seg; @@ -123,7 +125,7 @@ out_maperr: __func__, len, (unsigned long long)seg1->mr_dma, pageoff, i, rc); while (i--) - rpcrdma_unmap_one(ia, --seg); + rpcrdma_unmap_one(device, --seg); return rc; } @@ -135,14 +137,16 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_mr_seg *seg1 = seg; + struct ib_device *device; int rc, nsegs = seg->mr_nsegs; LIST_HEAD(l); list_add(&seg1->rl_mw->r.fmr->list, &l); rc = ib_unmap_fmr(&l); read_lock(&ia->ri_qplock); + device = ia->ri_id->device; while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); + rpcrdma_unmap_one(device, seg++); read_unlock(&ia->ri_qplock); if (rc) goto out_err; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index ea59c1b435ff..0a7b9df70133 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -178,6 +178,8 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_device *device = ia->ri_id->device; + enum dma_data_direction direction = rpcrdma_data_dir(writing); struct rpcrdma_mr_seg *seg1 = seg; struct rpcrdma_mw *mw = seg1->rl_mw; struct rpcrdma_frmr *frmr = &mw->r.frmr; @@ -197,7 +199,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (nsegs > ia->ri_max_frmr_depth) nsegs = ia->ri_max_frmr_depth; for (page_no = i = 0; i < nsegs;) { - rpcrdma_map_one(ia, seg, writing); + rpcrdma_map_one(device, seg, direction); pa = seg->mr_dma; for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { frmr->fr_pgl->page_list[page_no++] = pa; @@ -247,7 +249,7 @@ out_senderr: ib_update_fast_reg_key(mr, --key); frmr->fr_state = FRMR_IS_INVALID; while (i--) - rpcrdma_unmap_one(ia, --seg); + rpcrdma_unmap_one(device, --seg); return rc; } @@ -261,6 +263,7 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct ib_send_wr invalidate_wr, *bad_wr; int rc, nsegs = seg->mr_nsegs; + struct ib_device *device; seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; @@ -271,8 +274,9 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) DECR_CQCOUNT(&r_xprt->rx_ep); read_lock(&ia->ri_qplock); + device = ia->ri_id->device; while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); + rpcrdma_unmap_one(device, seg++); rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); read_unlock(&ia->ri_qplock); if (rc) diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 0ba130bed1fc..ba518af16787 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -50,7 +50,8 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - rpcrdma_map_one(ia, seg, writing); + rpcrdma_map_one(ia->ri_id->device, seg, + rpcrdma_data_dir(writing)); seg->mr_rkey = ia->ri_bind_mem->rkey; seg->mr_base = seg->mr_dma; seg->mr_nsegs = 1; @@ -62,7 +63,12 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, static int physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) { - rpcrdma_unmap_one(&r_xprt->rx_ia, seg); + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + read_lock(&ia->ri_qplock); + rpcrdma_unmap_one(ia->ri_id->device, seg); + read_unlock(&ia->ri_qplock); + return 1; } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index cac06f290a26..4870d272e006 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1436,6 +1436,14 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * Wrappers for internal-use kmalloc memory registration, used by buffer code. */ +void +rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) +{ + dprintk("RPC: map_one: offset %p iova %llx len %zu\n", + seg->mr_offset, + (unsigned long long)seg->mr_dma, seg->mr_dmalen); +} + static int rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, struct ib_mr **mrp, struct ib_sge *iov) @@ -1560,42 +1568,6 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) } } -/* - * Wrappers for chunk registration, shared by read/write chunk code. - */ - -void -rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, bool writing) -{ - seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; - seg->mr_dmalen = seg->mr_len; - if (seg->mr_page) - seg->mr_dma = ib_dma_map_page(ia->ri_id->device, - seg->mr_page, offset_in_page(seg->mr_offset), - seg->mr_dmalen, seg->mr_dir); - else - seg->mr_dma = ib_dma_map_single(ia->ri_id->device, - seg->mr_offset, - seg->mr_dmalen, seg->mr_dir); - if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) { - dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n", - __func__, - (unsigned long long)seg->mr_dma, - seg->mr_offset, seg->mr_dmalen); - } -} - -void -rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) -{ - if (seg->mr_page) - ib_dma_unmap_page(ia->ri_id->device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); - else - ib_dma_unmap_single(ia->ri_id->device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); -} - /* * Prepost any receive buffer, then post send. * diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 54bcbe47075d..78e0b8beaa36 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -424,8 +424,49 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *); -void rpcrdma_map_one(struct rpcrdma_ia *, struct rpcrdma_mr_seg *, bool); -void rpcrdma_unmap_one(struct rpcrdma_ia *, struct rpcrdma_mr_seg *); + +/* + * Wrappers for chunk registration, shared by read/write chunk code. + */ + +void rpcrdma_mapping_error(struct rpcrdma_mr_seg *); + +static inline enum dma_data_direction +rpcrdma_data_dir(bool writing) +{ + return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; +} + +static inline void +rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg, + enum dma_data_direction direction) +{ + seg->mr_dir = direction; + seg->mr_dmalen = seg->mr_len; + + if (seg->mr_page) + seg->mr_dma = ib_dma_map_page(device, + seg->mr_page, offset_in_page(seg->mr_offset), + seg->mr_dmalen, seg->mr_dir); + else + seg->mr_dma = ib_dma_map_single(device, + seg->mr_offset, + seg->mr_dmalen, seg->mr_dir); + + if (ib_dma_mapping_error(device, seg->mr_dma)) + rpcrdma_mapping_error(seg); +} + +static inline void +rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg) +{ + if (seg->mr_page) + ib_dma_unmap_page(device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); + else + ib_dma_unmap_single(device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); +} /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c -- cgit v1.2.3 From 9a51940bf65bf9fdc93027d70bdecdfc403c5b24 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 16 Mar 2015 14:06:23 -0400 Subject: NFS: Don't zap caches on fallocate() This patch adds a GETATTR to the end of ALLOCATE and DEALLOCATE operations so we can set the updated inode size and change attribute directly. DEALLOCATE will still need to release pagecache pages, so nfs42_proc_deallocate() now calls truncate_pagecache_range() before contacting the server. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 1 - fs/nfs/nfs42proc.c | 23 +++++++++++++++++++---- fs/nfs/nfs42xdr.c | 20 ++++++++++++++++---- fs/nfs/nfs4file.c | 1 - include/linux/nfs_xdr.h | 4 ++++ 5 files changed, 39 insertions(+), 10 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0c3be2658546..83743a133204 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -199,7 +199,6 @@ void nfs_zap_caches(struct inode *inode) nfs_zap_caches_locked(inode); spin_unlock(&inode->i_lock); } -EXPORT_SYMBOL_GPL(nfs_zap_caches); void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index cb170722769c..b9aa6bbcc8ed 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -36,13 +36,16 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, loff_t offset, loff_t len) { struct inode *inode = file_inode(filep); + struct nfs_server *server = NFS_SERVER(inode); struct nfs42_falloc_args args = { .falloc_fh = NFS_FH(inode), .falloc_offset = offset, .falloc_length = len, + .falloc_bitmask = server->cache_consistency_bitmask, + }; + struct nfs42_falloc_res res = { + .falloc_server = server, }; - struct nfs42_falloc_res res; - struct nfs_server *server = NFS_SERVER(inode); int status; msg->rpc_argp = &args; @@ -52,8 +55,17 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, if (status) return status; - return nfs4_call_sync(server->client, server, msg, - &args.seq_args, &res.seq_res, 0); + res.falloc_fattr = nfs_alloc_fattr(); + if (!res.falloc_fattr) + return -ENOMEM; + + status = nfs4_call_sync(server->client, server, msg, + &args.seq_args, &res.seq_res, 0); + if (status == 0) + status = nfs_post_op_update_inode(inode, res.falloc_fattr); + + kfree(res.falloc_fattr); + return status; } static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, @@ -101,7 +113,10 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) return -EOPNOTSUPP; + nfs_wb_all(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == 0) + truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; return err; diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 038a7e1521fa..1a25b27248f2 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -25,16 +25,20 @@ #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_allocate_maxsz) + encode_allocate_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - decode_allocate_maxsz) + decode_allocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_deallocate_maxsz) + encode_deallocate_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - decode_deallocate_maxsz) + decode_deallocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_seek_maxsz) @@ -92,6 +96,7 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->falloc_fh, &hdr); encode_allocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); encode_nops(&hdr); } @@ -110,6 +115,7 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->falloc_fh, &hdr); encode_deallocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); encode_nops(&hdr); } @@ -183,6 +189,9 @@ static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp, if (status) goto out; status = decode_allocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); out: return status; } @@ -207,6 +216,9 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, if (status) goto out; status = decode_deallocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); out: return status; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 866842b048ef..151ddff624d4 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -165,7 +165,6 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t ret = nfs42_proc_allocate(filep, offset, len); mutex_unlock(&inode->i_mutex); - nfs_zap_caches(inode); return ret; } #endif /* CONFIG_NFS_V4_2 */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 3d88908fd140..93ab6071bbe9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1273,11 +1273,15 @@ struct nfs42_falloc_args { nfs4_stateid falloc_stateid; u64 falloc_offset; u64 falloc_length; + const u32 *falloc_bitmask; }; struct nfs42_falloc_res { struct nfs4_sequence_res seq_res; unsigned int status; + + struct nfs_fattr *falloc_fattr; + const struct nfs_server *falloc_server; }; struct nfs42_seek_args { -- cgit v1.2.3 From f830f7ddd9165c8bd69127458627f03df4b1a406 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 16 Mar 2015 14:06:24 -0400 Subject: NFS: Reduce time spent holding the i_mutex during fallocate() At the very least, we should not be taking the i_mutex until after checking if the server even supports ALLOCATE or DEALLOCATE, allowing v4.0 or v4.1 to exit without potentially waiting on a lock. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 8 ++++++++ fs/nfs/nfs4file.c | 9 ++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index b9aa6bbcc8ed..3a9e75235f30 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -96,9 +96,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); + err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + + mutex_unlock(&inode->i_mutex); return err; } @@ -114,11 +118,15 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return -EOPNOTSUPP; nfs_wb_all(inode); + mutex_lock(&inode->i_mutex); + err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; + + mutex_unlock(&inode->i_mutex); return err; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 151ddff624d4..cb3c7879e59f 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -158,14 +158,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (ret < 0) return ret; - mutex_lock(&inode->i_mutex); if (mode & FALLOC_FL_PUNCH_HOLE) - ret = nfs42_proc_deallocate(filep, offset, len); - else - ret = nfs42_proc_allocate(filep, offset, len); - mutex_unlock(&inode->i_mutex); - - return ret; + return nfs42_proc_deallocate(filep, offset, len); + return nfs42_proc_allocate(filep, offset, len); } #endif /* CONFIG_NFS_V4_2 */ -- cgit v1.2.3 From 5d05e54af3cdbb13cf19c557ff2184781b91a22c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 20 Mar 2015 15:15:14 -0400 Subject: nfs: fix high load average due to callback thread sleeping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Chuck pointed out a problem that crept in with commit 6ffa30d3f734 (nfs: don't call blocking operations while !TASK_RUNNING). Linux counts tasks in uninterruptible sleep against the load average, so this caused the system's load average to be pinned at at least 1 when there was a NFSv4.1+ mount active. Not a huge problem, but it's probably worth fixing before we get too many complaints about it. This patch converts the code back to use TASK_INTERRUPTIBLE sleep, simply has it flush any signals on each loop iteration. In practice no one should really be signalling this thread at all, so I think this is reasonably safe. With this change, there's also no need to game the hung task watchdog so we can also convert the schedule_timeout call back to a normal schedule. Cc: Reported-by: Chuck Lever Signed-off-by: Jeff Layton Tested-by: Chuck Lever Fixes: commit 6ffa30d3f734 (“nfs: don't call blocking . . .”) Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 351be9205bf8..8d129bb7355a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -128,7 +128,7 @@ nfs41_callback_svc(void *vrqstp) if (try_to_freeze()) continue; - prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { req = list_first_entry(&serv->sv_cb_list, @@ -142,10 +142,10 @@ nfs41_callback_svc(void *vrqstp) error); } else { spin_unlock_bh(&serv->sv_cb_lock); - /* schedule_timeout to game the hung task watchdog */ - schedule_timeout(60 * HZ); + schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } + flush_signals(current); } return 0; } -- cgit v1.2.3 From 3f9400981691f6845e5c22b962500742b80a5484 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 31 Mar 2015 12:03:28 -0400 Subject: sunrpc: make debugfs file creation failure non-fatal v2: gracefully handle the case where some dentry pointers end up NULL and be more dilligent about zeroing out dentry pointers We currently have a problem that SELinux policy is being enforced when creating debugfs files. If a debugfs file is created as a side effect of doing some syscall, then that creation can fail if the SELinux policy for that process prevents it. This seems wrong. We don't do that for files under /proc, for instance, so Bruce has proposed a patch to fix that. While discussing that patch however, Greg K.H. stated: "No kernel code should care / fail if a debugfs function fails, so please fix up the sunrpc code first." This patch converts all of the sunrpc debugfs setup code to be void return functins, and the callers to not look for errors from those functions. This should allow rpc_clnt and rpc_xprt creation to work, even if the kernel fails to create debugfs files for some reason. Cc: Greg Kroah-Hartman Acked-by: "J. Bruce Fields" Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/sunrpc/debug.h | 18 +++++++-------- net/sunrpc/clnt.c | 4 +--- net/sunrpc/debugfs.c | 52 ++++++++++++++++++++++++-------------------- net/sunrpc/sunrpc_syms.c | 7 +----- net/sunrpc/xprt.c | 7 +----- 5 files changed, 41 insertions(+), 47 deletions(-) diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index c57d8ea0716c..59a7889e15db 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -60,17 +60,17 @@ struct rpc_xprt; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) void rpc_register_sysctl(void); void rpc_unregister_sysctl(void); -int sunrpc_debugfs_init(void); +void sunrpc_debugfs_init(void); void sunrpc_debugfs_exit(void); -int rpc_clnt_debugfs_register(struct rpc_clnt *); +void rpc_clnt_debugfs_register(struct rpc_clnt *); void rpc_clnt_debugfs_unregister(struct rpc_clnt *); -int rpc_xprt_debugfs_register(struct rpc_xprt *); +void rpc_xprt_debugfs_register(struct rpc_xprt *); void rpc_xprt_debugfs_unregister(struct rpc_xprt *); #else -static inline int +static inline void sunrpc_debugfs_init(void) { - return 0; + return; } static inline void @@ -79,10 +79,10 @@ sunrpc_debugfs_exit(void) return; } -static inline int +static inline void rpc_clnt_debugfs_register(struct rpc_clnt *clnt) { - return 0; + return; } static inline void @@ -91,10 +91,10 @@ rpc_clnt_debugfs_unregister(struct rpc_clnt *clnt) return; } -static inline int +static inline void rpc_xprt_debugfs_register(struct rpc_xprt *xprt) { - return 0; + return; } static inline void diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 612aa73bbc60..e6ce1517367f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -303,9 +303,7 @@ static int rpc_client_register(struct rpc_clnt *clnt, struct super_block *pipefs_sb; int err; - err = rpc_clnt_debugfs_register(clnt); - if (err) - return err; + rpc_clnt_debugfs_register(clnt); pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index e811f390f9f6..82962f7e6e88 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -129,48 +129,52 @@ static const struct file_operations tasks_fops = { .release = tasks_release, }; -int +void rpc_clnt_debugfs_register(struct rpc_clnt *clnt) { - int len, err; + int len; char name[24]; /* enough for "../../rpc_xprt/ + 8 hex digits + NULL */ + struct rpc_xprt *xprt; /* Already registered? */ - if (clnt->cl_debugfs) - return 0; + if (clnt->cl_debugfs || !rpc_clnt_dir) + return; len = snprintf(name, sizeof(name), "%x", clnt->cl_clid); if (len >= sizeof(name)) - return -EINVAL; + return; /* make the per-client dir */ clnt->cl_debugfs = debugfs_create_dir(name, rpc_clnt_dir); if (!clnt->cl_debugfs) - return -ENOMEM; + return; /* make tasks file */ - err = -ENOMEM; if (!debugfs_create_file("tasks", S_IFREG | S_IRUSR, clnt->cl_debugfs, clnt, &tasks_fops)) goto out_err; - err = -EINVAL; rcu_read_lock(); + xprt = rcu_dereference(clnt->cl_xprt); + /* no "debugfs" dentry? Don't bother with the symlink. */ + if (!xprt->debugfs) { + rcu_read_unlock(); + return; + } len = snprintf(name, sizeof(name), "../../rpc_xprt/%s", - rcu_dereference(clnt->cl_xprt)->debugfs->d_name.name); + xprt->debugfs->d_name.name); rcu_read_unlock(); + if (len >= sizeof(name)) goto out_err; - err = -ENOMEM; if (!debugfs_create_symlink("xprt", clnt->cl_debugfs, name)) goto out_err; - return 0; + return; out_err: debugfs_remove_recursive(clnt->cl_debugfs); clnt->cl_debugfs = NULL; - return err; } void @@ -226,33 +230,33 @@ static const struct file_operations xprt_info_fops = { .release = xprt_info_release, }; -int +void rpc_xprt_debugfs_register(struct rpc_xprt *xprt) { int len, id; static atomic_t cur_id; char name[9]; /* 8 hex digits + NULL term */ + if (!rpc_xprt_dir) + return; + id = (unsigned int)atomic_inc_return(&cur_id); len = snprintf(name, sizeof(name), "%x", id); if (len >= sizeof(name)) - return -EINVAL; + return; /* make the per-client dir */ xprt->debugfs = debugfs_create_dir(name, rpc_xprt_dir); if (!xprt->debugfs) - return -ENOMEM; + return; /* make tasks file */ if (!debugfs_create_file("info", S_IFREG | S_IRUSR, xprt->debugfs, xprt, &xprt_info_fops)) { debugfs_remove_recursive(xprt->debugfs); xprt->debugfs = NULL; - return -ENOMEM; } - - return 0; } void @@ -266,14 +270,17 @@ void __exit sunrpc_debugfs_exit(void) { debugfs_remove_recursive(topdir); + topdir = NULL; + rpc_clnt_dir = NULL; + rpc_xprt_dir = NULL; } -int __init +void __init sunrpc_debugfs_init(void) { topdir = debugfs_create_dir("sunrpc", NULL); if (!topdir) - goto out; + return; rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir); if (!rpc_clnt_dir) @@ -283,10 +290,9 @@ sunrpc_debugfs_init(void) if (!rpc_xprt_dir) goto out_remove; - return 0; + return; out_remove: debugfs_remove_recursive(topdir); topdir = NULL; -out: - return -ENOMEM; + rpc_clnt_dir = NULL; } diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index e37fbed87956..ee5d3d253102 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -98,10 +98,7 @@ init_sunrpc(void) if (err) goto out4; - err = sunrpc_debugfs_init(); - if (err) - goto out5; - + sunrpc_debugfs_init(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_register_sysctl(); #endif @@ -109,8 +106,6 @@ init_sunrpc(void) init_socket_xprt(); /* clnt sock transport */ return 0; -out5: - unregister_rpc_pipefs(); out4: unregister_pernet_subsys(&sunrpc_net_ops); out3: diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e3015aede0d9..9949722d99ce 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1331,7 +1331,6 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) */ struct rpc_xprt *xprt_create_transport(struct xprt_create *args) { - int err; struct rpc_xprt *xprt; struct xprt_class *t; @@ -1372,11 +1371,7 @@ found: return ERR_PTR(-ENOMEM); } - err = rpc_xprt_debugfs_register(xprt); - if (err) { - xprt_destroy(xprt); - return ERR_PTR(err); - } + rpc_xprt_debugfs_register(xprt); dprintk("RPC: created transport %p with %u slots\n", xprt, xprt->max_reqs); -- cgit v1.2.3 From ea96d1ecbe4fcb1df487d99309d3157b4ff5fc02 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 3 Apr 2015 14:35:59 -0400 Subject: nfs: Fetch MOUNTED_ON_FILEID when updating an inode 2ef47eb1 (NFS: Fix use of nfs_attr_use_mounted_on_fileid()) was a good start to fixing a circular directory structure warning for NFS v4 "junctioned" mountpoints. Unfortunately, further testing continued to generate this error. My server is configured like this: anna@nfsd ~ % df Filesystem Size Used Avail Use% Mounted on /dev/vda1 9.1G 2.0G 6.5G 24% / /dev/vdc1 1014M 33M 982M 4% /exports /dev/vdc2 1014M 33M 982M 4% /exports/vol1 /dev/vdc3 1014M 33M 982M 4% /exports/vol1/vol2 anna@nfsd ~ % cat /etc/exports /exports/ *(rw,async,no_subtree_check,no_root_squash) /exports/vol1/ *(rw,async,no_subtree_check,no_root_squash) /exports/vol1/vol2 *(rw,async,no_subtree_check,no_root_squash) I've been running chown across the entire mountpoint twice in a row to hit this problem. The first run succeeds, but the second one fails with the circular directory warning along with: anna@client ~ % dmesg [Apr 3 14:28] NFS: server 192.168.100.204 error: fileid changed fsid 0:39: expected fileid 0x100080, got 0x80 WHere 0x80 is the mountpoint's fileid and 0x100080 is the mounted-on fileid. This patch fixes the issue by requesting an updated mounted-on fileid from the server during nfs_update_inode(), and then checking that the fileid stored in the nfs_inode matches either the fileid or mounted-on fileid returned by the server. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 15 ++++++++++++++- fs/nfs/nfs4proc.c | 3 ++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 83743a133204..2d8270801215 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1593,6 +1593,19 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); + +static inline bool nfs_fileid_valid(struct nfs_inode *nfsi, + struct nfs_fattr *fattr) +{ + bool ret1 = true, ret2 = true; + + if (fattr->valid & NFS_ATTR_FATTR_FILEID) + ret1 = (nfsi->fileid == fattr->fileid); + if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + ret2 = (nfsi->fileid == fattr->mounted_on_fileid); + return ret1 || ret2; +} + /* * Many nfs protocol calls return the new file attributes after * an operation. Here we update the inode to reflect the state @@ -1619,7 +1632,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) { + if (!nfs_fileid_valid(nfsi, fattr)) { printk(KERN_ERR "NFS: server %s error: fileid changed\n" "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", NFS_SERVER(inode)->nfs_client->cl_hostname, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9ff8c63c9cac..6d48f37cfe8a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -185,7 +185,8 @@ const u32 nfs4_fattr_bitmap[3] = { | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_METADATA - | FATTR4_WORD1_TIME_MODIFY, + | FATTR4_WORD1_TIME_MODIFY + | FATTR4_WORD1_MOUNTED_ON_FILEID, #ifdef CONFIG_NFS_V4_SECURITY_LABEL FATTR4_WORD2_SECURITY_LABEL #endif -- cgit v1.2.3 From 1ccbad9f9f9bd36db26a10f0b17fbaf12b3ae93a Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 9 Apr 2015 23:02:16 +0800 Subject: nfs: fix DIO good bytes calculation For direct read that has IO size larger than rsize, we'll split it into several READ requests and nfs_direct_good_bytes() would count completed bytes incorrectly by eating last zero count reply. Fix it by handling mirror and non-mirror cases differently such that we only count mirrored writes differently. This fixes 5fadeb47("nfs: count DIO good bytes correctly with mirroring"). Reported-by: Jean Spector Cc: # v3.19+ Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 018a06a062bd..e97a67e40b48 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -131,20 +131,25 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count); - count = dreq->mirrors[hdr->pgio_mirror_idx].count; - if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { - count = hdr->io_start + hdr->good_bytes - dreq->io_start; - dreq->mirrors[hdr->pgio_mirror_idx].count = count; - } - - /* update the dreq->count by finding the minimum agreed count from all - * mirrors */ - count = dreq->mirrors[0].count; + if (dreq->mirror_count == 1) { + dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; + dreq->count += hdr->good_bytes; + } else { + /* mirrored writes */ + count = dreq->mirrors[hdr->pgio_mirror_idx].count; + if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { + count = hdr->io_start + hdr->good_bytes - dreq->io_start; + dreq->mirrors[hdr->pgio_mirror_idx].count = count; + } + /* update the dreq->count by finding the minimum agreed count from all + * mirrors */ + count = dreq->mirrors[0].count; - for (i = 1; i < dreq->mirror_count; i++) - count = min(count, dreq->mirrors[i].count); + for (i = 1; i < dreq->mirror_count; i++) + count = min(count, dreq->mirrors[i].count); - dreq->count = count; + dreq->count = count; + } } /* -- cgit v1.2.3 From 05f54903d9d370a4cd302a85681304d3ec59e5c1 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 9 Apr 2015 23:02:17 +0800 Subject: nfs: remove WARN_ON_ONCE from nfs_direct_good_bytes For flexfiles driver, we might choose to read from mirror index other than 0 while mirror_count is always 1 for read. Reported-by: Jean Spector Cc: # v3.19+ Cc: Weston Andros Adamson Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e97a67e40b48..eeb52b434e6f 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -129,8 +129,6 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) int i; ssize_t count; - WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count); - if (dreq->mirror_count == 1) { dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; dreq->count += hdr->good_bytes; -- cgit v1.2.3 From 7c61f0d3897eeeff6f3294adb9f910ddefa8035a Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 14 Apr 2015 10:34:20 -0400 Subject: NFS: Add a stub for GETDEVICELIST d4b18c3e (pnfs: remove GETDEVICELIST implementation) removed the GETDEVICELIST operation from the NFS client, but left a "hole" in the nfs4_procedures array. This caused /proc/self/mountstats to report an operation named "51" where GETDEVICELIST used to be. This patch adds a stub to fix mountstats. Signed-off-by: Anna Schumaker Fixes: d4b18c3e (pnfs: remove GETDEVICELIST implementation) Cc: stable@vger.kernel.org # 3.17+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 6b28a605c697..a26880cb8ce8 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7361,6 +7361,11 @@ nfs4_stat_to_errno(int stat) .p_name = #proc, \ } +#define STUB(proc) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_name = #proc, \ +} + struct rpc_procinfo nfs4_procedures[] = { PROC(READ, enc_read, dec_read), PROC(WRITE, enc_write, dec_write), @@ -7413,6 +7418,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + STUB(GETDEVICELIST), PROC(BIND_CONN_TO_SESSION, enc_bind_conn_to_session, dec_bind_conn_to_session), PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), -- cgit v1.2.3 From f9ebd61855253078fe8b07bacaf516337f8078e8 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 15 Apr 2015 13:00:04 -0400 Subject: NFS: Remove CONFIG_NFS_V4 checks from nfs_idmap.h The idmapper is completely internal to the NFS v4 module, so this macro will always evaluate to true. This patch also removes unnecessary includes of this file from the generic NFS client. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 1 - fs/nfs/super.c | 1 - include/linux/nfs_idmap.h | 11 ----------- 3 files changed, 13 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 19874151e95c..892aefff3630 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 322b2de02988..5cc3da9d9314 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h index 333844e38f66..daaf3ea5e9d8 100644 --- a/include/linux/nfs_idmap.h +++ b/include/linux/nfs_idmap.h @@ -46,19 +46,8 @@ struct nfs_server; struct nfs_fattr; struct nfs4_string; -#if IS_ENABLED(CONFIG_NFS_V4) int nfs_idmap_init(void); void nfs_idmap_quit(void); -#else -static inline int nfs_idmap_init(void) -{ - return 0; -} - -static inline void nfs_idmap_quit(void) -{} -#endif - int nfs_idmap_new(struct nfs_client *); void nfs_idmap_delete(struct nfs_client *); -- cgit v1.2.3 From 40c64c26a43494ba9982fd1b87dc54e3819566fc Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 15 Apr 2015 13:00:05 -0400 Subject: NFS: Move nfs_idmap.h into fs/nfs/ This file is only used internally to the NFS v4 module, so it doesn't need to be in the global include path. I also renamed it from nfs_idmap.h to nfs4idmap.h to emphasize that it's an NFSv4-only include file. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 2 +- fs/nfs/idmap.c | 2 +- fs/nfs/nfs4client.c | 2 +- fs/nfs/nfs4idmap.h | 68 ++++++++++++++++++++++++++++++++++ fs/nfs/nfs4proc.c | 2 +- fs/nfs/nfs4state.c | 2 +- fs/nfs/nfs4super.c | 2 +- fs/nfs/nfs4sysctl.c | 2 +- fs/nfs/nfs4xdr.c | 2 +- include/linux/nfs_idmap.h | 68 ---------------------------------- include/uapi/linux/nfs_idmap.h | 2 +- 11 files changed, 77 insertions(+), 77 deletions(-) create mode 100644 fs/nfs/nfs4idmap.h delete mode 100644 include/linux/nfs_idmap.h diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index f3ff66e4fb6a..7d05089e52d6 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -11,10 +11,10 @@ #include #include -#include #include "flexfilelayout.h" #include "../nfs4session.h" +#include "../nfs4idmap.h" #include "../internal.h" #include "../delegation.h" #include "../nfs4trace.h" diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 857e2a99acc8..2e1737c40a29 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include @@ -49,6 +48,7 @@ #include "internal.h" #include "netns.h" +#include "nfs4idmap.h" #include "nfs4trace.h" #define NFS_UINT_MAXLEN 11 diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 86d6214ea022..43ca5a7ac1da 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -4,7 +4,6 @@ */ #include #include -#include #include #include #include @@ -15,6 +14,7 @@ #include "callback.h" #include "delegation.h" #include "nfs4session.h" +#include "nfs4idmap.h" #include "pnfs.h" #include "netns.h" diff --git a/fs/nfs/nfs4idmap.h b/fs/nfs/nfs4idmap.h new file mode 100644 index 000000000000..de44d7330ab3 --- /dev/null +++ b/fs/nfs/nfs4idmap.h @@ -0,0 +1,68 @@ +/* + * fs/nfs/nfs4idmap.h + * + * UID and GID to name mapping for clients. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef NFS_IDMAP_H +#define NFS_IDMAP_H + +#include +#include + + +/* Forward declaration to make this header independent of others */ +struct nfs_client; +struct nfs_server; +struct nfs_fattr; +struct nfs4_string; + +int nfs_idmap_init(void); +void nfs_idmap_quit(void); +int nfs_idmap_new(struct nfs_client *); +void nfs_idmap_delete(struct nfs_client *); + +void nfs_fattr_init_names(struct nfs_fattr *fattr, + struct nfs4_string *owner_name, + struct nfs4_string *group_name); +void nfs_fattr_free_names(struct nfs_fattr *); +void nfs_fattr_map_and_free_names(struct nfs_server *, struct nfs_fattr *); + +int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, kuid_t *); +int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, kgid_t *); +int nfs_map_uid_to_name(const struct nfs_server *, kuid_t, char *, size_t); +int nfs_map_gid_to_group(const struct nfs_server *, kgid_t, char *, size_t); + +int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res); + +extern unsigned int nfs_idmap_cache_timeout; +#endif /* NFS_IDMAP_H */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6d48f37cfe8a..60396330044f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include @@ -63,6 +62,7 @@ #include "callback.h" #include "pnfs.h" #include "netns.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "fscache.h" diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f95e3b58bbc3..935a6ffe8643 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -57,6 +56,7 @@ #include "callback.h" #include "delegation.h" #include "internal.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "pnfs.h" #include "netns.h" diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index d91898193b2f..6fb7cb6b3f4b 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -3,12 +3,12 @@ */ #include #include -#include #include #include #include "delegation.h" #include "internal.h" #include "nfs4_fs.h" +#include "nfs4idmap.h" #include "dns_resolve.h" #include "pnfs.h" #include "nfs.h" diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index b6ebe7e445f6..0fbd3ab1be22 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -6,10 +6,10 @@ * Copyright (c) 2006 Trond Myklebust */ #include -#include #include #include "nfs4_fs.h" +#include "nfs4idmap.h" #include "callback.h" static const int nfs_set_port_min = 0; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index a26880cb8ce8..0aea97841d30 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,10 +52,10 @@ #include #include #include -#include #include "nfs4_fs.h" #include "internal.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "pnfs.h" #include "netns.h" diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h deleted file mode 100644 index daaf3ea5e9d8..000000000000 --- a/include/linux/nfs_idmap.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * include/linux/nfs_idmap.h - * - * UID and GID to name mapping for clients. - * - * Copyright (c) 2002 The Regents of the University of Michigan. - * All rights reserved. - * - * Marius Aamodt Eriksen - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef NFS_IDMAP_H -#define NFS_IDMAP_H - -#include -#include - - -/* Forward declaration to make this header independent of others */ -struct nfs_client; -struct nfs_server; -struct nfs_fattr; -struct nfs4_string; - -int nfs_idmap_init(void); -void nfs_idmap_quit(void); -int nfs_idmap_new(struct nfs_client *); -void nfs_idmap_delete(struct nfs_client *); - -void nfs_fattr_init_names(struct nfs_fattr *fattr, - struct nfs4_string *owner_name, - struct nfs4_string *group_name); -void nfs_fattr_free_names(struct nfs_fattr *); -void nfs_fattr_map_and_free_names(struct nfs_server *, struct nfs_fattr *); - -int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, kuid_t *); -int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, kgid_t *); -int nfs_map_uid_to_name(const struct nfs_server *, kuid_t, char *, size_t); -int nfs_map_gid_to_group(const struct nfs_server *, kgid_t, char *, size_t); - -int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res); - -extern unsigned int nfs_idmap_cache_timeout; -#endif /* NFS_IDMAP_H */ diff --git a/include/uapi/linux/nfs_idmap.h b/include/uapi/linux/nfs_idmap.h index 8d4b1c7b24d4..038e36c96669 100644 --- a/include/uapi/linux/nfs_idmap.h +++ b/include/uapi/linux/nfs_idmap.h @@ -1,5 +1,5 @@ /* - * include/linux/nfs_idmap.h + * include/uapi/linux/nfs_idmap.h * * UID and GID to name mapping for clients. * -- cgit v1.2.3 From 7b320382d0d861e46f55087b50d252410b1f402f Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 15 Apr 2015 13:00:06 -0400 Subject: NFS: Rename idmap.c to nfs4idmap.c I added the nfs4 prefix to make it obvious that this file is built into the NFS v4 module, and not the generic client. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/Makefile | 2 +- fs/nfs/idmap.c | 792 ----------------------------------------------------- fs/nfs/nfs4idmap.c | 792 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 793 insertions(+), 793 deletions(-) delete mode 100644 fs/nfs/idmap.c create mode 100644 fs/nfs/nfs4idmap.c diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 1e987acf20c9..8664417955a2 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -22,7 +22,7 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o obj-$(CONFIG_NFS_V4) += nfsv4.o CFLAGS_nfs4trace.o += -I$(src) nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ - delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ + delegation.o nfs4idmap.o callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \ dns_resolve.o nfs4trace.o nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c deleted file mode 100644 index 2e1737c40a29..000000000000 --- a/fs/nfs/idmap.c +++ /dev/null @@ -1,792 +0,0 @@ -/* - * fs/nfs/idmap.c - * - * UID and GID to name mapping for clients. - * - * Copyright (c) 2002 The Regents of the University of Michigan. - * All rights reserved. - * - * Marius Aamodt Eriksen - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" -#include "netns.h" -#include "nfs4idmap.h" -#include "nfs4trace.h" - -#define NFS_UINT_MAXLEN 11 - -static const struct cred *id_resolver_cache; -static struct key_type key_type_id_resolver_legacy; - -struct idmap_legacy_upcalldata { - struct rpc_pipe_msg pipe_msg; - struct idmap_msg idmap_msg; - struct key_construction *key_cons; - struct idmap *idmap; -}; - -struct idmap { - struct rpc_pipe_dir_object idmap_pdo; - struct rpc_pipe *idmap_pipe; - struct idmap_legacy_upcalldata *idmap_upcall_data; - struct mutex idmap_mutex; -}; - -/** - * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields - * @fattr: fully initialised struct nfs_fattr - * @owner_name: owner name string cache - * @group_name: group name string cache - */ -void nfs_fattr_init_names(struct nfs_fattr *fattr, - struct nfs4_string *owner_name, - struct nfs4_string *group_name) -{ - fattr->owner_name = owner_name; - fattr->group_name = group_name; -} - -static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr) -{ - fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME; - kfree(fattr->owner_name->data); -} - -static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) -{ - fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME; - kfree(fattr->group_name->data); -} - -static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) -{ - struct nfs4_string *owner = fattr->owner_name; - kuid_t uid; - - if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) - return false; - if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) { - fattr->uid = uid; - fattr->valid |= NFS_ATTR_FATTR_OWNER; - } - return true; -} - -static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) -{ - struct nfs4_string *group = fattr->group_name; - kgid_t gid; - - if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) - return false; - if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) { - fattr->gid = gid; - fattr->valid |= NFS_ATTR_FATTR_GROUP; - } - return true; -} - -/** - * nfs_fattr_free_names - free up the NFSv4 owner and group strings - * @fattr: a fully initialised nfs_fattr structure - */ -void nfs_fattr_free_names(struct nfs_fattr *fattr) -{ - if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) - nfs_fattr_free_owner_name(fattr); - if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) - nfs_fattr_free_group_name(fattr); -} - -/** - * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free - * @server: pointer to the filesystem nfs_server structure - * @fattr: a fully initialised nfs_fattr structure - * - * This helper maps the cached NFSv4 owner/group strings in fattr into - * their numeric uid/gid equivalents, and then frees the cached strings. - */ -void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr) -{ - if (nfs_fattr_map_owner_name(server, fattr)) - nfs_fattr_free_owner_name(fattr); - if (nfs_fattr_map_group_name(server, fattr)) - nfs_fattr_free_group_name(fattr); -} - -int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) -{ - unsigned long val; - char buf[16]; - - if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) - return 0; - memcpy(buf, name, namelen); - buf[namelen] = '\0'; - if (kstrtoul(buf, 0, &val) != 0) - return 0; - *res = val; - return 1; -} -EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric); - -static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) -{ - return snprintf(buf, buflen, "%u", id); -} - -static struct key_type key_type_id_resolver = { - .name = "id_resolver", - .preparse = user_preparse, - .free_preparse = user_free_preparse, - .instantiate = generic_key_instantiate, - .revoke = user_revoke, - .destroy = user_destroy, - .describe = user_describe, - .read = user_read, -}; - -static int nfs_idmap_init_keyring(void) -{ - struct cred *cred; - struct key *keyring; - int ret = 0; - - printk(KERN_NOTICE "NFS: Registering the %s key type\n", - key_type_id_resolver.name); - - cred = prepare_kernel_cred(NULL); - if (!cred) - return -ENOMEM; - - keyring = keyring_alloc(".id_resolver", - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(keyring)) { - ret = PTR_ERR(keyring); - goto failed_put_cred; - } - - ret = register_key_type(&key_type_id_resolver); - if (ret < 0) - goto failed_put_key; - - ret = register_key_type(&key_type_id_resolver_legacy); - if (ret < 0) - goto failed_reg_legacy; - - set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); - cred->thread_keyring = keyring; - cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; - id_resolver_cache = cred; - return 0; - -failed_reg_legacy: - unregister_key_type(&key_type_id_resolver); -failed_put_key: - key_put(keyring); -failed_put_cred: - put_cred(cred); - return ret; -} - -static void nfs_idmap_quit_keyring(void) -{ - key_revoke(id_resolver_cache->thread_keyring); - unregister_key_type(&key_type_id_resolver); - unregister_key_type(&key_type_id_resolver_legacy); - put_cred(id_resolver_cache); -} - -/* - * Assemble the description to pass to request_key() - * This function will allocate a new string and update dest to point - * at it. The caller is responsible for freeing dest. - * - * On error 0 is returned. Otherwise, the length of dest is returned. - */ -static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, - const char *type, size_t typelen, char **desc) -{ - char *cp; - size_t desclen = typelen + namelen + 2; - - *desc = kmalloc(desclen, GFP_KERNEL); - if (!*desc) - return -ENOMEM; - - cp = *desc; - memcpy(cp, type, typelen); - cp += typelen; - *cp++ = ':'; - - memcpy(cp, name, namelen); - cp += namelen; - *cp = '\0'; - return desclen; -} - -static struct key *nfs_idmap_request_key(const char *name, size_t namelen, - const char *type, struct idmap *idmap) -{ - char *desc; - struct key *rkey; - ssize_t ret; - - ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); - if (ret <= 0) - return ERR_PTR(ret); - - rkey = request_key(&key_type_id_resolver, desc, ""); - if (IS_ERR(rkey)) { - mutex_lock(&idmap->idmap_mutex); - rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, - desc, "", 0, idmap); - mutex_unlock(&idmap->idmap_mutex); - } - if (!IS_ERR(rkey)) - set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags); - - kfree(desc); - return rkey; -} - -static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, - const char *type, void *data, - size_t data_size, struct idmap *idmap) -{ - const struct cred *saved_cred; - struct key *rkey; - struct user_key_payload *payload; - ssize_t ret; - - saved_cred = override_creds(id_resolver_cache); - rkey = nfs_idmap_request_key(name, namelen, type, idmap); - revert_creds(saved_cred); - - if (IS_ERR(rkey)) { - ret = PTR_ERR(rkey); - goto out; - } - - rcu_read_lock(); - rkey->perm |= KEY_USR_VIEW; - - ret = key_validate(rkey); - if (ret < 0) - goto out_up; - - payload = rcu_dereference(rkey->payload.rcudata); - if (IS_ERR_OR_NULL(payload)) { - ret = PTR_ERR(payload); - goto out_up; - } - - ret = payload->datalen; - if (ret > 0 && ret <= data_size) - memcpy(data, payload->data, ret); - else - ret = -EINVAL; - -out_up: - rcu_read_unlock(); - key_put(rkey); -out: - return ret; -} - -/* ID -> Name */ -static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, - size_t buflen, struct idmap *idmap) -{ - char id_str[NFS_UINT_MAXLEN]; - int id_len; - ssize_t ret; - - id_len = snprintf(id_str, sizeof(id_str), "%u", id); - ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap); - if (ret < 0) - return -EINVAL; - return ret; -} - -/* Name -> ID */ -static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type, - __u32 *id, struct idmap *idmap) -{ - char id_str[NFS_UINT_MAXLEN]; - long id_long; - ssize_t data_size; - int ret = 0; - - data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap); - if (data_size <= 0) { - ret = -EINVAL; - } else { - ret = kstrtol(id_str, 10, &id_long); - *id = (__u32)id_long; - } - return ret; -} - -/* idmap classic begins here */ - -enum { - Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err -}; - -static const match_table_t nfs_idmap_tokens = { - { Opt_find_uid, "uid:%s" }, - { Opt_find_gid, "gid:%s" }, - { Opt_find_user, "user:%s" }, - { Opt_find_group, "group:%s" }, - { Opt_find_err, NULL } -}; - -static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *); -static ssize_t idmap_pipe_downcall(struct file *, const char __user *, - size_t); -static void idmap_release_pipe(struct inode *); -static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); - -static const struct rpc_pipe_ops idmap_upcall_ops = { - .upcall = rpc_pipe_generic_upcall, - .downcall = idmap_pipe_downcall, - .release_pipe = idmap_release_pipe, - .destroy_msg = idmap_pipe_destroy_msg, -}; - -static struct key_type key_type_id_resolver_legacy = { - .name = "id_legacy", - .preparse = user_preparse, - .free_preparse = user_free_preparse, - .instantiate = generic_key_instantiate, - .revoke = user_revoke, - .destroy = user_destroy, - .describe = user_describe, - .read = user_read, - .request_key = nfs_idmap_legacy_upcall, -}; - -static void nfs_idmap_pipe_destroy(struct dentry *dir, - struct rpc_pipe_dir_object *pdo) -{ - struct idmap *idmap = pdo->pdo_data; - struct rpc_pipe *pipe = idmap->idmap_pipe; - - if (pipe->dentry) { - rpc_unlink(pipe->dentry); - pipe->dentry = NULL; - } -} - -static int nfs_idmap_pipe_create(struct dentry *dir, - struct rpc_pipe_dir_object *pdo) -{ - struct idmap *idmap = pdo->pdo_data; - struct rpc_pipe *pipe = idmap->idmap_pipe; - struct dentry *dentry; - - dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - pipe->dentry = dentry; - return 0; -} - -static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = { - .create = nfs_idmap_pipe_create, - .destroy = nfs_idmap_pipe_destroy, -}; - -int -nfs_idmap_new(struct nfs_client *clp) -{ - struct idmap *idmap; - struct rpc_pipe *pipe; - int error; - - idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); - if (idmap == NULL) - return -ENOMEM; - - rpc_init_pipe_dir_object(&idmap->idmap_pdo, - &nfs_idmap_pipe_dir_object_ops, - idmap); - - pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0); - if (IS_ERR(pipe)) { - error = PTR_ERR(pipe); - goto err; - } - idmap->idmap_pipe = pipe; - mutex_init(&idmap->idmap_mutex); - - error = rpc_add_pipe_dir_object(clp->cl_net, - &clp->cl_rpcclient->cl_pipedir_objects, - &idmap->idmap_pdo); - if (error) - goto err_destroy_pipe; - - clp->cl_idmap = idmap; - return 0; -err_destroy_pipe: - rpc_destroy_pipe_data(idmap->idmap_pipe); -err: - kfree(idmap); - return error; -} - -void -nfs_idmap_delete(struct nfs_client *clp) -{ - struct idmap *idmap = clp->cl_idmap; - - if (!idmap) - return; - clp->cl_idmap = NULL; - rpc_remove_pipe_dir_object(clp->cl_net, - &clp->cl_rpcclient->cl_pipedir_objects, - &idmap->idmap_pdo); - rpc_destroy_pipe_data(idmap->idmap_pipe); - kfree(idmap); -} - -int nfs_idmap_init(void) -{ - int ret; - ret = nfs_idmap_init_keyring(); - if (ret != 0) - goto out; -out: - return ret; -} - -void nfs_idmap_quit(void) -{ - nfs_idmap_quit_keyring(); -} - -static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, - struct idmap_msg *im, - struct rpc_pipe_msg *msg) -{ - substring_t substr; - int token, ret; - - im->im_type = IDMAP_TYPE_GROUP; - token = match_token(desc, nfs_idmap_tokens, &substr); - - switch (token) { - case Opt_find_uid: - im->im_type = IDMAP_TYPE_USER; - case Opt_find_gid: - im->im_conv = IDMAP_CONV_NAMETOID; - ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ); - break; - - case Opt_find_user: - im->im_type = IDMAP_TYPE_USER; - case Opt_find_group: - im->im_conv = IDMAP_CONV_IDTONAME; - ret = match_int(&substr, &im->im_id); - break; - - default: - ret = -EINVAL; - goto out; - } - - msg->data = im; - msg->len = sizeof(struct idmap_msg); - -out: - return ret; -} - -static bool -nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, - struct idmap_legacy_upcalldata *data) -{ - if (idmap->idmap_upcall_data != NULL) { - WARN_ON_ONCE(1); - return false; - } - idmap->idmap_upcall_data = data; - return true; -} - -static void -nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) -{ - struct key_construction *cons = idmap->idmap_upcall_data->key_cons; - - kfree(idmap->idmap_upcall_data); - idmap->idmap_upcall_data = NULL; - complete_request_key(cons, ret); -} - -static void -nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) -{ - if (idmap->idmap_upcall_data != NULL) - nfs_idmap_complete_pipe_upcall_locked(idmap, ret); -} - -static int nfs_idmap_legacy_upcall(struct key_construction *cons, - const char *op, - void *aux) -{ - struct idmap_legacy_upcalldata *data; - struct rpc_pipe_msg *msg; - struct idmap_msg *im; - struct idmap *idmap = (struct idmap *)aux; - struct key *key = cons->key; - int ret = -ENOMEM; - - /* msg and im are freed in idmap_pipe_destroy_msg */ - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - goto out1; - - msg = &data->pipe_msg; - im = &data->idmap_msg; - data->idmap = idmap; - data->key_cons = cons; - - ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); - if (ret < 0) - goto out2; - - ret = -EAGAIN; - if (!nfs_idmap_prepare_pipe_upcall(idmap, data)) - goto out2; - - ret = rpc_queue_upcall(idmap->idmap_pipe, msg); - if (ret < 0) - nfs_idmap_abort_pipe_upcall(idmap, ret); - - return ret; -out2: - kfree(data); -out1: - complete_request_key(cons, ret); - return ret; -} - -static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen) -{ - return key_instantiate_and_link(key, data, datalen, - id_resolver_cache->thread_keyring, - authkey); -} - -static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, - struct idmap_msg *upcall, - struct key *key, struct key *authkey) -{ - char id_str[NFS_UINT_MAXLEN]; - size_t len; - int ret = -ENOKEY; - - /* ret = -ENOKEY */ - if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv) - goto out; - switch (im->im_conv) { - case IDMAP_CONV_NAMETOID: - if (strcmp(upcall->im_name, im->im_name) != 0) - break; - /* Note: here we store the NUL terminator too */ - len = sprintf(id_str, "%d", im->im_id) + 1; - ret = nfs_idmap_instantiate(key, authkey, id_str, len); - break; - case IDMAP_CONV_IDTONAME: - if (upcall->im_id != im->im_id) - break; - len = strlen(im->im_name); - ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); - break; - default: - ret = -EINVAL; - } -out: - return ret; -} - -static ssize_t -idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) -{ - struct rpc_inode *rpci = RPC_I(file_inode(filp)); - struct idmap *idmap = (struct idmap *)rpci->private; - struct key_construction *cons; - struct idmap_msg im; - size_t namelen_in; - int ret = -ENOKEY; - - /* If instantiation is successful, anyone waiting for key construction - * will have been woken up and someone else may now have used - * idmap_key_cons - so after this point we may no longer touch it. - */ - if (idmap->idmap_upcall_data == NULL) - goto out_noupcall; - - cons = idmap->idmap_upcall_data->key_cons; - - if (mlen != sizeof(im)) { - ret = -ENOSPC; - goto out; - } - - if (copy_from_user(&im, src, mlen) != 0) { - ret = -EFAULT; - goto out; - } - - if (!(im.im_status & IDMAP_STATUS_SUCCESS)) { - ret = -ENOKEY; - goto out; - } - - namelen_in = strnlen(im.im_name, IDMAP_NAMESZ); - if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { - ret = -EINVAL; - goto out; -} - - ret = nfs_idmap_read_and_verify_message(&im, - &idmap->idmap_upcall_data->idmap_msg, - cons->key, cons->authkey); - if (ret >= 0) { - key_set_timeout(cons->key, nfs_idmap_cache_timeout); - ret = mlen; - } - -out: - nfs_idmap_complete_pipe_upcall_locked(idmap, ret); -out_noupcall: - return ret; -} - -static void -idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) -{ - struct idmap_legacy_upcalldata *data = container_of(msg, - struct idmap_legacy_upcalldata, - pipe_msg); - struct idmap *idmap = data->idmap; - - if (msg->errno) - nfs_idmap_abort_pipe_upcall(idmap, msg->errno); -} - -static void -idmap_release_pipe(struct inode *inode) -{ - struct rpc_inode *rpci = RPC_I(inode); - struct idmap *idmap = (struct idmap *)rpci->private; - - nfs_idmap_abort_pipe_upcall(idmap, -EPIPE); -} - -int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid) -{ - struct idmap *idmap = server->nfs_client->cl_idmap; - __u32 id = -1; - int ret = 0; - - if (!nfs_map_string_to_numeric(name, namelen, &id)) - ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap); - if (ret == 0) { - *uid = make_kuid(&init_user_ns, id); - if (!uid_valid(*uid)) - ret = -ERANGE; - } - trace_nfs4_map_name_to_uid(name, namelen, id, ret); - return ret; -} - -int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid) -{ - struct idmap *idmap = server->nfs_client->cl_idmap; - __u32 id = -1; - int ret = 0; - - if (!nfs_map_string_to_numeric(name, namelen, &id)) - ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap); - if (ret == 0) { - *gid = make_kgid(&init_user_ns, id); - if (!gid_valid(*gid)) - ret = -ERANGE; - } - trace_nfs4_map_group_to_gid(name, namelen, id, ret); - return ret; -} - -int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen) -{ - struct idmap *idmap = server->nfs_client->cl_idmap; - int ret = -EINVAL; - __u32 id; - - id = from_kuid(&init_user_ns, uid); - if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) - ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); - if (ret < 0) - ret = nfs_map_numeric_to_string(id, buf, buflen); - trace_nfs4_map_uid_to_name(buf, ret, id, ret); - return ret; -} -int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen) -{ - struct idmap *idmap = server->nfs_client->cl_idmap; - int ret = -EINVAL; - __u32 id; - - id = from_kgid(&init_user_ns, gid); - if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) - ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); - if (ret < 0) - ret = nfs_map_numeric_to_string(id, buf, buflen); - trace_nfs4_map_gid_to_group(buf, ret, id, ret); - return ret; -} diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c new file mode 100644 index 000000000000..2e1737c40a29 --- /dev/null +++ b/fs/nfs/nfs4idmap.c @@ -0,0 +1,792 @@ +/* + * fs/nfs/idmap.c + * + * UID and GID to name mapping for clients. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "netns.h" +#include "nfs4idmap.h" +#include "nfs4trace.h" + +#define NFS_UINT_MAXLEN 11 + +static const struct cred *id_resolver_cache; +static struct key_type key_type_id_resolver_legacy; + +struct idmap_legacy_upcalldata { + struct rpc_pipe_msg pipe_msg; + struct idmap_msg idmap_msg; + struct key_construction *key_cons; + struct idmap *idmap; +}; + +struct idmap { + struct rpc_pipe_dir_object idmap_pdo; + struct rpc_pipe *idmap_pipe; + struct idmap_legacy_upcalldata *idmap_upcall_data; + struct mutex idmap_mutex; +}; + +/** + * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields + * @fattr: fully initialised struct nfs_fattr + * @owner_name: owner name string cache + * @group_name: group name string cache + */ +void nfs_fattr_init_names(struct nfs_fattr *fattr, + struct nfs4_string *owner_name, + struct nfs4_string *group_name) +{ + fattr->owner_name = owner_name; + fattr->group_name = group_name; +} + +static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME; + kfree(fattr->owner_name->data); +} + +static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) +{ + fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME; + kfree(fattr->group_name->data); +} + +static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *owner = fattr->owner_name; + kuid_t uid; + + if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) + return false; + if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) { + fattr->uid = uid; + fattr->valid |= NFS_ATTR_FATTR_OWNER; + } + return true; +} + +static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) +{ + struct nfs4_string *group = fattr->group_name; + kgid_t gid; + + if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) + return false; + if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) { + fattr->gid = gid; + fattr->valid |= NFS_ATTR_FATTR_GROUP; + } + return true; +} + +/** + * nfs_fattr_free_names - free up the NFSv4 owner and group strings + * @fattr: a fully initialised nfs_fattr structure + */ +void nfs_fattr_free_names(struct nfs_fattr *fattr) +{ + if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) + nfs_fattr_free_owner_name(fattr); + if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) + nfs_fattr_free_group_name(fattr); +} + +/** + * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free + * @server: pointer to the filesystem nfs_server structure + * @fattr: a fully initialised nfs_fattr structure + * + * This helper maps the cached NFSv4 owner/group strings in fattr into + * their numeric uid/gid equivalents, and then frees the cached strings. + */ +void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr) +{ + if (nfs_fattr_map_owner_name(server, fattr)) + nfs_fattr_free_owner_name(fattr); + if (nfs_fattr_map_group_name(server, fattr)) + nfs_fattr_free_group_name(fattr); +} + +int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) +{ + unsigned long val; + char buf[16]; + + if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) + return 0; + memcpy(buf, name, namelen); + buf[namelen] = '\0'; + if (kstrtoul(buf, 0, &val) != 0) + return 0; + *res = val; + return 1; +} +EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric); + +static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) +{ + return snprintf(buf, buflen, "%u", id); +} + +static struct key_type key_type_id_resolver = { + .name = "id_resolver", + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, +}; + +static int nfs_idmap_init_keyring(void) +{ + struct cred *cred; + struct key *keyring; + int ret = 0; + + printk(KERN_NOTICE "NFS: Registering the %s key type\n", + key_type_id_resolver.name); + + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = keyring_alloc(".id_resolver", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = register_key_type(&key_type_id_resolver); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&key_type_id_resolver_legacy); + if (ret < 0) + goto failed_reg_legacy; + + set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + id_resolver_cache = cred; + return 0; + +failed_reg_legacy: + unregister_key_type(&key_type_id_resolver); +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +static void nfs_idmap_quit_keyring(void) +{ + key_revoke(id_resolver_cache->thread_keyring); + unregister_key_type(&key_type_id_resolver); + unregister_key_type(&key_type_id_resolver_legacy); + put_cred(id_resolver_cache); +} + +/* + * Assemble the description to pass to request_key() + * This function will allocate a new string and update dest to point + * at it. The caller is responsible for freeing dest. + * + * On error 0 is returned. Otherwise, the length of dest is returned. + */ +static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, + const char *type, size_t typelen, char **desc) +{ + char *cp; + size_t desclen = typelen + namelen + 2; + + *desc = kmalloc(desclen, GFP_KERNEL); + if (!*desc) + return -ENOMEM; + + cp = *desc; + memcpy(cp, type, typelen); + cp += typelen; + *cp++ = ':'; + + memcpy(cp, name, namelen); + cp += namelen; + *cp = '\0'; + return desclen; +} + +static struct key *nfs_idmap_request_key(const char *name, size_t namelen, + const char *type, struct idmap *idmap) +{ + char *desc; + struct key *rkey; + ssize_t ret; + + ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); + if (ret <= 0) + return ERR_PTR(ret); + + rkey = request_key(&key_type_id_resolver, desc, ""); + if (IS_ERR(rkey)) { + mutex_lock(&idmap->idmap_mutex); + rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, + desc, "", 0, idmap); + mutex_unlock(&idmap->idmap_mutex); + } + if (!IS_ERR(rkey)) + set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags); + + kfree(desc); + return rkey; +} + +static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, + const char *type, void *data, + size_t data_size, struct idmap *idmap) +{ + const struct cred *saved_cred; + struct key *rkey; + struct user_key_payload *payload; + ssize_t ret; + + saved_cred = override_creds(id_resolver_cache); + rkey = nfs_idmap_request_key(name, namelen, type, idmap); + revert_creds(saved_cred); + + if (IS_ERR(rkey)) { + ret = PTR_ERR(rkey); + goto out; + } + + rcu_read_lock(); + rkey->perm |= KEY_USR_VIEW; + + ret = key_validate(rkey); + if (ret < 0) + goto out_up; + + payload = rcu_dereference(rkey->payload.rcudata); + if (IS_ERR_OR_NULL(payload)) { + ret = PTR_ERR(payload); + goto out_up; + } + + ret = payload->datalen; + if (ret > 0 && ret <= data_size) + memcpy(data, payload->data, ret); + else + ret = -EINVAL; + +out_up: + rcu_read_unlock(); + key_put(rkey); +out: + return ret; +} + +/* ID -> Name */ +static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, + size_t buflen, struct idmap *idmap) +{ + char id_str[NFS_UINT_MAXLEN]; + int id_len; + ssize_t ret; + + id_len = snprintf(id_str, sizeof(id_str), "%u", id); + ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap); + if (ret < 0) + return -EINVAL; + return ret; +} + +/* Name -> ID */ +static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type, + __u32 *id, struct idmap *idmap) +{ + char id_str[NFS_UINT_MAXLEN]; + long id_long; + ssize_t data_size; + int ret = 0; + + data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap); + if (data_size <= 0) { + ret = -EINVAL; + } else { + ret = kstrtol(id_str, 10, &id_long); + *id = (__u32)id_long; + } + return ret; +} + +/* idmap classic begins here */ + +enum { + Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err +}; + +static const match_table_t nfs_idmap_tokens = { + { Opt_find_uid, "uid:%s" }, + { Opt_find_gid, "gid:%s" }, + { Opt_find_user, "user:%s" }, + { Opt_find_group, "group:%s" }, + { Opt_find_err, NULL } +}; + +static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *); +static ssize_t idmap_pipe_downcall(struct file *, const char __user *, + size_t); +static void idmap_release_pipe(struct inode *); +static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); + +static const struct rpc_pipe_ops idmap_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = idmap_pipe_downcall, + .release_pipe = idmap_release_pipe, + .destroy_msg = idmap_pipe_destroy_msg, +}; + +static struct key_type key_type_id_resolver_legacy = { + .name = "id_legacy", + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, + .request_key = nfs_idmap_legacy_upcall, +}; + +static void nfs_idmap_pipe_destroy(struct dentry *dir, + struct rpc_pipe_dir_object *pdo) +{ + struct idmap *idmap = pdo->pdo_data; + struct rpc_pipe *pipe = idmap->idmap_pipe; + + if (pipe->dentry) { + rpc_unlink(pipe->dentry); + pipe->dentry = NULL; + } +} + +static int nfs_idmap_pipe_create(struct dentry *dir, + struct rpc_pipe_dir_object *pdo) +{ + struct idmap *idmap = pdo->pdo_data; + struct rpc_pipe *pipe = idmap->idmap_pipe; + struct dentry *dentry; + + dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + pipe->dentry = dentry; + return 0; +} + +static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = { + .create = nfs_idmap_pipe_create, + .destroy = nfs_idmap_pipe_destroy, +}; + +int +nfs_idmap_new(struct nfs_client *clp) +{ + struct idmap *idmap; + struct rpc_pipe *pipe; + int error; + + idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); + if (idmap == NULL) + return -ENOMEM; + + rpc_init_pipe_dir_object(&idmap->idmap_pdo, + &nfs_idmap_pipe_dir_object_ops, + idmap); + + pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0); + if (IS_ERR(pipe)) { + error = PTR_ERR(pipe); + goto err; + } + idmap->idmap_pipe = pipe; + mutex_init(&idmap->idmap_mutex); + + error = rpc_add_pipe_dir_object(clp->cl_net, + &clp->cl_rpcclient->cl_pipedir_objects, + &idmap->idmap_pdo); + if (error) + goto err_destroy_pipe; + + clp->cl_idmap = idmap; + return 0; +err_destroy_pipe: + rpc_destroy_pipe_data(idmap->idmap_pipe); +err: + kfree(idmap); + return error; +} + +void +nfs_idmap_delete(struct nfs_client *clp) +{ + struct idmap *idmap = clp->cl_idmap; + + if (!idmap) + return; + clp->cl_idmap = NULL; + rpc_remove_pipe_dir_object(clp->cl_net, + &clp->cl_rpcclient->cl_pipedir_objects, + &idmap->idmap_pdo); + rpc_destroy_pipe_data(idmap->idmap_pipe); + kfree(idmap); +} + +int nfs_idmap_init(void) +{ + int ret; + ret = nfs_idmap_init_keyring(); + if (ret != 0) + goto out; +out: + return ret; +} + +void nfs_idmap_quit(void) +{ + nfs_idmap_quit_keyring(); +} + +static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, + struct idmap_msg *im, + struct rpc_pipe_msg *msg) +{ + substring_t substr; + int token, ret; + + im->im_type = IDMAP_TYPE_GROUP; + token = match_token(desc, nfs_idmap_tokens, &substr); + + switch (token) { + case Opt_find_uid: + im->im_type = IDMAP_TYPE_USER; + case Opt_find_gid: + im->im_conv = IDMAP_CONV_NAMETOID; + ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ); + break; + + case Opt_find_user: + im->im_type = IDMAP_TYPE_USER; + case Opt_find_group: + im->im_conv = IDMAP_CONV_IDTONAME; + ret = match_int(&substr, &im->im_id); + break; + + default: + ret = -EINVAL; + goto out; + } + + msg->data = im; + msg->len = sizeof(struct idmap_msg); + +out: + return ret; +} + +static bool +nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, + struct idmap_legacy_upcalldata *data) +{ + if (idmap->idmap_upcall_data != NULL) { + WARN_ON_ONCE(1); + return false; + } + idmap->idmap_upcall_data = data; + return true; +} + +static void +nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) +{ + struct key_construction *cons = idmap->idmap_upcall_data->key_cons; + + kfree(idmap->idmap_upcall_data); + idmap->idmap_upcall_data = NULL; + complete_request_key(cons, ret); +} + +static void +nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) +{ + if (idmap->idmap_upcall_data != NULL) + nfs_idmap_complete_pipe_upcall_locked(idmap, ret); +} + +static int nfs_idmap_legacy_upcall(struct key_construction *cons, + const char *op, + void *aux) +{ + struct idmap_legacy_upcalldata *data; + struct rpc_pipe_msg *msg; + struct idmap_msg *im; + struct idmap *idmap = (struct idmap *)aux; + struct key *key = cons->key; + int ret = -ENOMEM; + + /* msg and im are freed in idmap_pipe_destroy_msg */ + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out1; + + msg = &data->pipe_msg; + im = &data->idmap_msg; + data->idmap = idmap; + data->key_cons = cons; + + ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); + if (ret < 0) + goto out2; + + ret = -EAGAIN; + if (!nfs_idmap_prepare_pipe_upcall(idmap, data)) + goto out2; + + ret = rpc_queue_upcall(idmap->idmap_pipe, msg); + if (ret < 0) + nfs_idmap_abort_pipe_upcall(idmap, ret); + + return ret; +out2: + kfree(data); +out1: + complete_request_key(cons, ret); + return ret; +} + +static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen) +{ + return key_instantiate_and_link(key, data, datalen, + id_resolver_cache->thread_keyring, + authkey); +} + +static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, + struct idmap_msg *upcall, + struct key *key, struct key *authkey) +{ + char id_str[NFS_UINT_MAXLEN]; + size_t len; + int ret = -ENOKEY; + + /* ret = -ENOKEY */ + if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv) + goto out; + switch (im->im_conv) { + case IDMAP_CONV_NAMETOID: + if (strcmp(upcall->im_name, im->im_name) != 0) + break; + /* Note: here we store the NUL terminator too */ + len = sprintf(id_str, "%d", im->im_id) + 1; + ret = nfs_idmap_instantiate(key, authkey, id_str, len); + break; + case IDMAP_CONV_IDTONAME: + if (upcall->im_id != im->im_id) + break; + len = strlen(im->im_name); + ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); + break; + default: + ret = -EINVAL; + } +out: + return ret; +} + +static ssize_t +idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) +{ + struct rpc_inode *rpci = RPC_I(file_inode(filp)); + struct idmap *idmap = (struct idmap *)rpci->private; + struct key_construction *cons; + struct idmap_msg im; + size_t namelen_in; + int ret = -ENOKEY; + + /* If instantiation is successful, anyone waiting for key construction + * will have been woken up and someone else may now have used + * idmap_key_cons - so after this point we may no longer touch it. + */ + if (idmap->idmap_upcall_data == NULL) + goto out_noupcall; + + cons = idmap->idmap_upcall_data->key_cons; + + if (mlen != sizeof(im)) { + ret = -ENOSPC; + goto out; + } + + if (copy_from_user(&im, src, mlen) != 0) { + ret = -EFAULT; + goto out; + } + + if (!(im.im_status & IDMAP_STATUS_SUCCESS)) { + ret = -ENOKEY; + goto out; + } + + namelen_in = strnlen(im.im_name, IDMAP_NAMESZ); + if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { + ret = -EINVAL; + goto out; +} + + ret = nfs_idmap_read_and_verify_message(&im, + &idmap->idmap_upcall_data->idmap_msg, + cons->key, cons->authkey); + if (ret >= 0) { + key_set_timeout(cons->key, nfs_idmap_cache_timeout); + ret = mlen; + } + +out: + nfs_idmap_complete_pipe_upcall_locked(idmap, ret); +out_noupcall: + return ret; +} + +static void +idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct idmap_legacy_upcalldata *data = container_of(msg, + struct idmap_legacy_upcalldata, + pipe_msg); + struct idmap *idmap = data->idmap; + + if (msg->errno) + nfs_idmap_abort_pipe_upcall(idmap, msg->errno); +} + +static void +idmap_release_pipe(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + struct idmap *idmap = (struct idmap *)rpci->private; + + nfs_idmap_abort_pipe_upcall(idmap, -EPIPE); +} + +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + __u32 id = -1; + int ret = 0; + + if (!nfs_map_string_to_numeric(name, namelen, &id)) + ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap); + if (ret == 0) { + *uid = make_kuid(&init_user_ns, id); + if (!uid_valid(*uid)) + ret = -ERANGE; + } + trace_nfs4_map_name_to_uid(name, namelen, id, ret); + return ret; +} + +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + __u32 id = -1; + int ret = 0; + + if (!nfs_map_string_to_numeric(name, namelen, &id)) + ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap); + if (ret == 0) { + *gid = make_kgid(&init_user_ns, id); + if (!gid_valid(*gid)) + ret = -ERANGE; + } + trace_nfs4_map_group_to_gid(name, namelen, id, ret); + return ret; +} + +int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + __u32 id; + + id = from_kuid(&init_user_ns, uid); + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); + if (ret < 0) + ret = nfs_map_numeric_to_string(id, buf, buflen); + trace_nfs4_map_uid_to_name(buf, ret, id, ret); + return ret; +} +int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen) +{ + struct idmap *idmap = server->nfs_client->cl_idmap; + int ret = -EINVAL; + __u32 id; + + id = from_kgid(&init_user_ns, gid); + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); + if (ret < 0) + ret = nfs_map_numeric_to_string(id, buf, buflen); + trace_nfs4_map_gid_to_group(buf, ret, id, ret); + return ret; +} -- cgit v1.2.3 From 3708f842e107b9b79d54a75d152e666b693649e8 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Thu, 16 Apr 2015 18:48:39 +0800 Subject: Revert "nfs: replace nfs_add_stats with nfs_inc_stats when add one" This reverts commit 5a254d08b086d80cbead2ebcee6d2a4b3a15587a. Since commit 5a254d08b086 ("nfs: replace nfs_add_stats with nfs_inc_stats when add one"), nfs_readpage and nfs_do_writepage use nfs_inc_stats to increment NFSIOS_READPAGES and NFSIOS_WRITEPAGES instead of nfs_add_stats. However nfs_inc_stats does not do the same thing as nfs_add_stats with value 1 because these functions work on distinct stats: nfs_inc_stats increments stats from "enum nfs_stat_eventcounters" (in server->io_stats->events) and nfs_add_stats those from "enum nfs_stat_bytecounters" (in server->io_stats->bytes). Signed-off-by: Nicolas Iooss Fixes: 5a254d08b086 ("nfs: replace nfs_add_stats with nfs_inc_stats...") Cc: stable@vger.kernel.org # 3.19+ Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 2 +- fs/nfs/write.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 568ecf0a880f..848d8b1db4ce 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -284,7 +284,7 @@ int nfs_readpage(struct file *file, struct page *page) dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", page, PAGE_CACHE_SIZE, page_file_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); - nfs_inc_stats(inode, NFSIOS_READPAGES); + nfs_add_stats(inode, NFSIOS_READPAGES, 1); /* * Try to flush any pending writes to the file.. diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7f933a39f385..36d0c0a6e68a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -580,7 +580,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st int ret; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_inc_stats(inode, NFSIOS_WRITEPAGES); + nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_pageio_cond_complete(pgio, page_file_index(page)); ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); -- cgit v1.2.3 From ce85cfbed6fe3dbc01bd1976b23ac3e97878cde6 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 21 Apr 2015 14:17:35 -0400 Subject: NFS: Don't attempt to decode missing directory entries If a READDIR reply comes back without any page data, avoid a NULL pointer dereference in xdr_copy_to_scratch(). BUG: unable to handle kernel NULL pointer dereference at 0000000000000001 IP: [] memcpy+0xd/0x110 ... Call Trace: ? xdr_inline_decode+0x7a/0xb0 [sunrpc] nfs3_decode_dirent+0x73/0x320 [nfsv3] nfs_readdir_page_filler+0xd5/0x4e0 [nfs] ? nfs3_rpc_wrapper.constprop.9+0x42/0xc0 [nfsv3] nfs_readdir_xdr_to_array+0x1fa/0x330 [nfs] ? mem_cgroup_commit_charge+0xac/0x160 ? nfs_readdir_xdr_to_array+0x330/0x330 [nfs] nfs_readdir_filler+0x22/0x90 [nfs] do_read_cache_page+0x7e/0x1a0 read_cache_page+0x1c/0x20 nfs_readdir+0x18e/0x660 [nfs] ? nfs3_xdr_dec_getattr3res+0x80/0x80 [nfsv3] iterate_dir+0x97/0x130 SyS_getdents+0x94/0x120 ? fillonedir+0xd0/0xd0 system_call_fastpath+0x12/0x17 Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c19e16f0b2d0..a00ba92acccc 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -544,6 +544,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en if (scratch == NULL) return -ENOMEM; + if (buflen == 0) + goto out_nopages; + xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); @@ -565,6 +568,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en break; } while (!entry->eof); +out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { array = nfs_readdir_get_array(page); if (!IS_ERR(array)) { -- cgit v1.2.3 From c456aacf3c27b9a3013d4524fdc4bd0bb61fb884 Mon Sep 17 00:00:00 2001 From: Firo Yang Date: Thu, 23 Apr 2015 17:17:51 +0800 Subject: nfs: Remove unneeded casts in nfs Don't unnecessarily cast allocation return value in fs/nfs/inode.c::nfs_alloc_inode(). Signed-off-by: Firo Yang Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2d8270801215..8a464684e8c8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1837,7 +1837,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; - nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); + nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); if (!nfsi) return NULL; nfsi->flags = 0UL; -- cgit v1.2.3 From c7757074839f2cd440521482d76ea180d0d4bdac Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 23 Apr 2015 17:17:40 +0100 Subject: fs/nfs: fix new compiler warning about boolean in switch The brand new GCC 5.1.0 warns by default on using a boolean in the switch condition. This results in the following warning: fs/nfs/nfs4proc.c: In function 'nfs4_proc_get_rootfh': fs/nfs/nfs4proc.c:3100:10: warning: switch condition has boolean value [-Wswitch-bool] switch (auth_probe) { ^ This code was obviously using switch to make use of the fall-through semantics (without the usual comment, though). Rewrite that code using if statements to avoid the warning and make the code a bit more readable on the way. Signed-off-by: Andre Przywara Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 60396330044f..addf17d25bea 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3096,16 +3096,13 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info, bool auth_probe) { - int status; + int status = 0; - switch (auth_probe) { - case false: + if (!auth_probe) status = nfs4_lookup_root(server, fhandle, info); - if (status != -NFS4ERR_WRONGSEC) - break; - default: + + if (auth_probe || status == NFS4ERR_WRONGSEC) status = nfs4_do_find_root_sec(server, fhandle, info); - } if (status == 0) status = nfs4_server_capabilities(server, fhandle); -- cgit v1.2.3