diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-23 18:36:41 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-23 18:36:41 -0700 |
| commit | 840ef6c78e6a2f694b578ecb9063241c992aaa9e (patch) | |
| tree | d5915e31458e709297d3487482288cde25dcca70 | |
| parent | 09ca8dc7d634f69d0b43f82c244add44cf7885b4 (diff) | |
| parent | 284ea3fb4f6715201e1d9ef3474c25e817ad70e9 (diff) | |
| download | lwn-840ef6c78e6a2f694b578ecb9063241c992aaa9e.tar.gz lwn-840ef6c78e6a2f694b578ecb9063241c992aaa9e.zip | |
Merge tag 'nfs-for-7.2-1' of git://git.linux-nfs.org/projects/anna/linux-nfs
Pull NFS client updates from Anna Schumaker:
"New features:
- XPRTRDMA: Decouple req recycling from RPC completion
- NFS: Expose FMODE_NOWAIT for read-only files
Bugfixes:
- SUNRPC:
- Fix sunrpc sysfs error handling
- Fix uninitialized xprt_create_args structure
- XPRTRDMA:
- Harden connect and reply handling
- NFS:
- Fix EOF updates after fallocate/zero-range
- Keep PG_UPTODATE clear after read errors in page groups
- Use nfsi->rwsem to protect traversal of the file lock list
- Prevent resource leak in nfs_alloc_server()
- NFSv4:
- Clear exception state on successful mkdir retry
- Don't skip revalidate when holding a dir delegation and attrs are stale
- pNFS:
- Fix use-after-free in pnfs_update_layout()
- Defer return_range callbacks until after inode unlock
- Fix LAYOUTCOMMIT retry loop on OLD_STATEID
- Reject zero-length r_addr in nfs4_decode_mp_ds_addr
- NFS/flexfiles:
- Reject zero-length filehandle version arrays
- Fix checking if a layout is striped
- Fixes for honoring FF_FLAGS_NO_IO_THRU_MDS
Other cleanups and improvements:
- Remove the fileid field from struct nfs_inode
- Move long-delayed xprtrdma work onto the system_dfl_long_wq
- Convert xprtrdma send buffer free list to an llist
- Show "<redacted>" for cert_serial and privkey_serial mount options"
* tag 'nfs-for-7.2-1' of git://git.linux-nfs.org/projects/anna/linux-nfs: (42 commits)
NFS: Use common error handling code in nfs_alloc_server()
NFS: Prevent resource leak in nfs_alloc_server()
NFSv4/pNFS: reject zero-length r_addr in nfs4_decode_mp_ds_addr
nfs: don't skip revalidate on directory delegation when attrs flagged stale
xprtrdma: Return sendctx slot after Send preparation failure
xprtrdma: Repost Receive buffers for malformed replies
xprtrdma: Sanitize the reply credit grant after parsing
xprtrdma: Fix bcall rep leak and unbounded peek
xprtrdma: Resize reply buffers before reposting receives
xprtrdma: Check frwr_wp_create() during connect
xprtrdma: Initialize re_id before removal registration
xprtrdma: Fix ep kref imbalance on ADDR_CHANGE
xprtrdma: Convert send buffer free list to llist
NFS: correct CONFIG_NFS_V4 macro name in #endif comment
nfs: use nfsi->rwsem to protect traversal of the file lock list
NFSv4.1/pNFS: fix LAYOUTCOMMIT retry loop on OLD_STATEID
nfs: expose FMODE_NOWAIT for read-only files
nfs: add nowait version of nfs_start_io_direct
NFSv4/flexfiles: honor FF_FLAGS_NO_IO_THRU_MDS in pg_get_mirror_count_write
NFSv4/flexfiles: honor FF_FLAGS_NO_IO_THRU_MDS on fatal DS connect errors
...
35 files changed, 703 insertions, 326 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a68003c3599c..b5493a7f8f22 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4288,13 +4288,6 @@ Kernel parameters Only applies if the softerr mount option is enabled, and the specified value is >= 0. - nfs.enable_ino64= - [NFS] enable 64-bit inode numbers. - If zero, the NFS client will fake up a 32-bit inode - number for the readdir() and stat() syscalls instead - of returning the full 64-bit number. - The default is to return 64-bit inode numbers. - nfs.idmap_cache_timeout= [NFS] set the maximum lifetime for idmapper cache entries. diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 4ea9221ded42..10f2354ba304 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -257,6 +257,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, struct pnfs_layout_hdr *lo; u32 rv = NFS4ERR_NOMATCHING_LAYOUT; LIST_HEAD(free_me_list); + bool return_range = false; ino = nfs_layout_find_inode(clp, &args->cbl_fh, &args->cbl_stateid); if (IS_ERR(ino)) { @@ -301,13 +302,13 @@ static u32 initiate_file_draining(struct nfs_client *clp, /* Embrace your forgetfulness! 
*/ rv = NFS4ERR_NOMATCHING_LAYOUT; - if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { - NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, - &args->cbl_range); - } + return_range = true; } unlock: spin_unlock(&ino->i_lock); + if (return_range && NFS_SERVER(ino)->pnfs_curr_ld->return_range) + NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, + &args->cbl_range); pnfs_free_lseg_list(&free_me_list); /* Free all lsegs that are attached to commit buckets */ nfs_commit_inode(ino, 0); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 73b95318ba48..4dcb91ab3039 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1063,10 +1063,8 @@ struct nfs_server *nfs_alloc_server(void) return NULL; server->s_sysfs_id = ida_alloc(&s_sysfs_ids, GFP_KERNEL); - if (server->s_sysfs_id < 0) { - kfree(server); - return NULL; - } + if (server->s_sysfs_id < 0) + goto free_server; server->client = server->client_acl = ERR_PTR(-EINVAL); @@ -1088,8 +1086,8 @@ struct nfs_server *nfs_alloc_server(void) server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { - kfree(server); - return NULL; + ida_free(&s_sysfs_ids, server->s_sysfs_id); + goto free_server; } server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; @@ -1103,6 +1101,10 @@ struct nfs_server *nfs_alloc_server(void) rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); return server; + +free_server: + kfree(server); + return NULL; } EXPORT_SYMBOL_GPL(nfs_alloc_server); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 122fb3f14ffb..9546d2195c25 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -173,6 +173,7 @@ int nfs4_check_delegation(struct inode *inode, fmode_t type) static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_stateid *stateid) { struct inode *inode = state->inode; + struct nfs_inode *nfsi = NFS_I(inode); struct file_lock *fl; struct file_lock_context *flctx = locks_inode_context(inode); struct list_head *list; @@ -182,6 +183,9 @@ static int 
nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state goto out; list = &flctx->flc_posix; + + /* Guard against reclaim and new lock/unlock calls */ + down_write(&nfsi->rwsem); spin_lock(&flctx->flc_lock); restart: for_each_file_lock(fl, list) { @@ -189,8 +193,10 @@ restart: continue; spin_unlock(&flctx->flc_lock); status = nfs4_lock_delegation_recall(fl, state, stateid); - if (status < 0) + if (status < 0) { + up_write(&nfsi->rwsem); goto out; + } spin_lock(&flctx->flc_lock); } if (list == &flctx->flc_posix) { @@ -198,6 +204,7 @@ restart: goto restart; } spin_unlock(&flctx->flc_lock); + up_write(&nfsi->rwsem); out: return status; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2f5f26f93238..c7b723c18620 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -650,7 +650,7 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) return 0; nfsi = NFS_I(inode); - if (entry->fattr->fileid != nfsi->fileid) + if (entry->fattr->fileid != inode->i_ino) return 0; if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0) return 0; @@ -1105,7 +1105,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->name, ent->name_len, - nfs_compat_user_ino64(ent->ino), ent->d_type)) { + ent->ino, ent->d_type)) { desc->eob = true; break; } @@ -2301,7 +2301,7 @@ full_reval: return nfs_do_lookup_revalidate(dir, name, dentry, flags); } -#endif /* CONFIG_NFSV4 */ +#endif /* CONFIG_NFS_V4 */ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, struct file *file, unsigned int open_flags, diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 48d89716193a..e626c72495e6 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -466,14 +466,22 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, goto out_release; } dreq->l_ctx = l_ctx; - if (!is_sync_kiocb(iocb)) + if (!is_sync_kiocb(iocb)) { dreq->iocb = iocb; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + result = 
-EAGAIN; + nfs_direct_req_release(dreq); + goto out_release; + } if (user_backed_iter(iter)) dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; if (!swap) { - result = nfs_start_io_direct(inode); + if (iocb->ki_flags & IOCB_NOWAIT) + result = nfs_start_io_direct_nowait(inode); + else + result = nfs_start_io_direct(inode); if (result) { /* release the reference that would usually be * consumed by nfs_direct_read_schedule_iovec() diff --git a/fs/nfs/export.c b/fs/nfs/export.c index a10dd5f9d078..8fb08bce0623 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -49,14 +49,14 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) return FILEID_INVALID; } - p[FILEID_HIGH_OFF] = NFS_FILEID(inode) >> 32; - p[FILEID_LOW_OFF] = NFS_FILEID(inode); + p[FILEID_HIGH_OFF] = inode->i_ino >> 32; + p[FILEID_LOW_OFF] = inode->i_ino; p[FILE_I_TYPE_OFF] = inode->i_mode & S_IFMT; p[len - 1] = 0; /* Padding */ nfs_copy_fh(clnt_fh, server_fh); *max_len = len; dprintk("%s: result fh fileid %llu mode %u size %d\n", - __func__, NFS_FILEID(inode), inode->i_mode, *max_len); + __func__, inode->i_ino, inode->i_mode, *max_len); return *max_len; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 25048a3c2364..a0d8f1c1cf10 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -72,8 +72,12 @@ nfs_file_open(struct inode *inode, struct file *filp) return res; res = nfs_open(inode, filp); - if (res == 0) + if (res == 0) { filp->f_mode |= FMODE_CAN_ODIRECT; + /* flag NOWAIT on read-only files only */ + if (!(filp->f_mode & FMODE_WRITE)) + filp->f_mode |= FMODE_NOWAIT; + } return res; } @@ -166,6 +170,10 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) if (iocb->ki_flags & IOCB_DIRECT) return nfs_file_direct_read(iocb, to, false); + /* NOWAIT only supported on direct reads */ + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); @@ -705,6 +713,12 @@ ssize_t 
nfs_file_write(struct kiocb *iocb, struct iov_iter *from) trace_nfs_file_write(iocb, from); + /* + * FMODE_NOWAIT is not set for writable files + */ + if (WARN_ON_ONCE(iocb->ki_flags & IOCB_NOWAIT)) + return -EAGAIN; + result = nfs_key_timeout_notify(file, inode); if (result) return result; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index e85380e3b11d..72e20b56fbc7 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -95,7 +95,7 @@ static void filelayout_reset_write(struct nfs_pgio_header *hdr) "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, hdr->task.tk_pid, hdr->inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(hdr->inode), + (unsigned long long)hdr->inode->i_ino, hdr->args.count, (unsigned long long)hdr->args.offset); @@ -112,7 +112,7 @@ static void filelayout_reset_read(struct nfs_pgio_header *hdr) "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, hdr->task.tk_pid, hdr->inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(hdr->inode), + (unsigned long long)hdr->inode->i_ino, hdr->args.count, (unsigned long long)hdr->args.offset); @@ -778,6 +778,8 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, static bool filelayout_lseg_is_striped(const struct nfs4_filelayout_segment *flseg) { + if (flseg->dsaddr) + return flseg->dsaddr->stripe_count > 1; return flseg->num_fh > 1; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 8b1559171fe3..c4aa995026f6 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -551,6 +551,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, if (!p) goto out_err_free; fh_count = be32_to_cpup(p); + if (fh_count == 0) { + rc = -EINVAL; + goto out_err_free; + } dss_info->fh_versions = kzalloc_objs(struct nfs_fh, fh_count, gfp_flags); @@ -632,6 +636,9 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, if (!p) goto out_sort_mirrors; fls->flags = be32_to_cpup(p); + if 
(fls->flags & FF_FLAGS_NO_IO_THRU_MDS) + set_bit(NFS4_FF_HDR_NO_IO_THRU_MDS, + &FF_LAYOUT_FROM_HDR(lh)->flags); p = xdr_inline_decode(&stream, 4); if (!p) @@ -1181,6 +1188,16 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_RW, NFS_I(pgio->pg_inode)->layout, pgio->pg_lseg); + if (NFS_I(pgio->pg_inode)->layout && + ff_layout_hdr_no_fallback_to_mds(NFS_I(pgio->pg_inode)->layout)) { + /* + * FF_FLAGS_NO_IO_THRU_MDS: no current lseg but the server's + * policy forbids MDS fallback. Surface -EAGAIN so writeback + * retries rather than silently issuing the WRITE via MDS. + */ + pgio->pg_error = -EAGAIN; + goto out; + } /* no lseg means that pnfs is not in use, so no mirroring here */ nfs_pageio_reset_write_mds(pgio); out: @@ -1230,7 +1247,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, hdr->task.tk_pid, hdr->inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(hdr->inode), + (unsigned long long)hdr->inode->i_ino, hdr->args.count, (unsigned long long)hdr->args.offset); @@ -1243,7 +1260,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, hdr->task.tk_pid, hdr->inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(hdr->inode), + (unsigned long long)hdr->inode->i_ino, hdr->args.count, (unsigned long long)hdr->args.offset); @@ -1283,7 +1300,7 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr) "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, hdr->task.tk_pid, hdr->inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(hdr->inode), + (unsigned long long)hdr->inode->i_ino, hdr->args.count, (unsigned long long)hdr->args.offset); @@ -2200,6 +2217,14 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) out_failed: if (ff_layout_avoid_mds_available_ds(lseg) && !ds_fatal_error) return PNFS_TRY_AGAIN; + if (ff_layout_no_fallback_to_mds(lseg)) { 
+ /* + * FF_FLAGS_NO_IO_THRU_MDS: force fresh LAYOUTGET, + * never fall through to MDS I/O. + */ + pnfs_error_mark_layout_for_return(hdr->inode, lseg); + return PNFS_TRY_AGAIN; + } trace_pnfs_mds_fallback_read_pagelist(hdr->inode, hdr->args.offset, hdr->args.count, IOMODE_READ, NFS_I(hdr->inode)->layout, lseg); @@ -2285,6 +2310,14 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) out_failed: if (ff_layout_avoid_mds_available_ds(lseg) && !ds_fatal_error) return PNFS_TRY_AGAIN; + if (ff_layout_no_fallback_to_mds(lseg)) { + /* + * FF_FLAGS_NO_IO_THRU_MDS: force fresh LAYOUTGET, + * never fall through to MDS I/O. + */ + pnfs_error_mark_layout_for_return(hdr->inode, lseg); + return PNFS_TRY_AGAIN; + } trace_pnfs_mds_fallback_write_pagelist(hdr->inode, hdr->args.offset, hdr->args.count, IOMODE_RW, NFS_I(hdr->inode)->layout, lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 17a008c8e97c..a5bd00f69e82 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -112,12 +112,16 @@ struct nfs4_ff_layout_segment { struct nfs4_ff_layout_mirror *mirror_array[] __counted_by(mirror_array_cnt); }; +/* nfs4_flexfile_layout::flags bit indices */ +#define NFS4_FF_HDR_NO_IO_THRU_MDS 0 /* any lseg has had FF_FLAGS_NO_IO_THRU_MDS */ + struct nfs4_flexfile_layout { struct pnfs_layout_hdr generic_hdr; struct pnfs_ds_commit_info commit_info; struct list_head mirrors; struct list_head error_list; /* nfs4_ff_layout_ds_err */ ktime_t last_report_time; /* Layoutstat report times */ + unsigned long flags; }; struct nfs4_flexfile_layoutreturn_args { @@ -184,6 +188,18 @@ ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg) return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_IO_THRU_MDS; } +/* + * Sticky hdr-level mirror of FF_FLAGS_NO_IO_THRU_MDS so callers that have + * no current lseg (e.g. 
between LAYOUTRETURN and the next LAYOUTGET) can + * still honor the no-MDS-fallback policy. + */ +static inline bool +ff_layout_hdr_no_fallback_to_mds(struct pnfs_layout_hdr *lo) +{ + return test_bit(NFS4_FF_HDR_NO_IO_THRU_MDS, + &FF_LAYOUT_FROM_HDR(lo)->flags); +} + static inline bool ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6227df9ae6f1..5bcd4027d203 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -58,21 +58,23 @@ #define NFSDBG_FACILITY NFSDBG_VFS -#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 +static bool enable_ino64; -/* Default is to see 64-bit inode numbers */ -static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; +static int param_set_enable_ino64(const char *val, const struct kernel_param *kp) +{ + pr_notice("enable_ino64 is deprecated and has no effect\n"); + return 0; +} + +static const struct kernel_param_ops param_ops_enable_ino64 = { + .set = param_set_enable_ino64, + .get = param_get_bool, +}; static int nfs_update_inode(struct inode *, struct nfs_fattr *); static struct kmem_cache * nfs_inode_cachep; -static inline unsigned long -nfs_fattr_to_ino_t(struct nfs_fattr *fattr) -{ - return nfs_fileid_to_ino_t(fattr->fileid); -} - int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { if (unlikely(nfs_current_task_exiting())) @@ -84,29 +86,6 @@ int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); -/** - * nfs_compat_user_ino64 - returns the user-visible inode number - * @fileid: 64-bit fileid - * - * This function returns a 32-bit inode number if the boot parameter - * nfs.enable_ino64 is zero. 
- */ -u64 nfs_compat_user_ino64(u64 fileid) -{ -#ifdef CONFIG_COMPAT - compat_ulong_t ino; -#else - unsigned long ino; -#endif - - if (enable_ino64) - return fileid; - ino = fileid; - if (sizeof(ino) < sizeof(fileid)) - ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8; - return ino; -} - int nfs_drop_inode(struct inode *inode) { return NFS_STALE(inode) || inode_generic_drop(inode); @@ -314,8 +293,7 @@ struct nfs_find_desc { }; /* - * In NFSv3 we can have 64bit inode numbers. In order to support - * this, and re-exported directories (also seen in NFSv2) + * For re-exported directories (also seen in NFSv2) * we are forced to allow 2 different inodes to have the same * i_ino. */ @@ -326,7 +304,7 @@ nfs_find_actor(struct inode *inode, void *opaque) struct nfs_fh *fh = desc->fh; struct nfs_fattr *fattr = desc->fattr; - if (NFS_FILEID(inode) != fattr->fileid) + if (inode->i_ino != fattr->fileid) return 0; if (inode_wrong_type(inode, fattr->mode)) return 0; @@ -343,7 +321,7 @@ nfs_init_locked(struct inode *inode, void *opaque) struct nfs_find_desc *desc = opaque; struct nfs_fattr *fattr = desc->fattr; - set_nfs_fileid(inode, fattr->fileid); + inode->i_ino = fattr->fileid; inode->i_mode = fattr->mode; nfs_copy_fh(NFS_FH(inode), desc->fh); return 0; @@ -414,13 +392,13 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh) .fattr = fattr, }; struct inode *inode; - unsigned long hash; + u64 hash; if (!(fattr->valid & NFS_ATTR_FATTR_FILEID) || !(fattr->valid & NFS_ATTR_FATTR_TYPE)) return NULL; - hash = nfs_fattr_to_ino_t(fattr); + hash = fattr->fileid; inode = ilookup5(sb, hash, nfs_find_actor, &desc); dprintk("%s: returning %p\n", __func__, inode); @@ -457,7 +435,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) }; struct inode *inode = ERR_PTR(-ENOENT); u64 fattr_supported = NFS_SB(sb)->fattr_valid; - unsigned long hash; + u64 hash; nfs_attr_check_mountpoint(sb, fattr); @@ -468,7 +446,7 @@ nfs_fhget(struct 
super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; - hash = nfs_fattr_to_ino_t(fattr); + hash = fattr->fileid; inode = iget5_locked(sb, hash, nfs_find_actor, nfs_init_locked, &desc); if (inode == NULL) { @@ -480,10 +458,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) struct nfs_inode *nfsi = NFS_I(inode); unsigned long now = jiffies; - /* We set i_ino for the few things that still rely on it, - * such as stat(2) */ - inode->i_ino = hash; - /* We can't support update_atime(), since the server will reset it */ inode->i_flags |= S_NOATIME|S_NOCMTIME; inode->i_mode = fattr->mode; @@ -607,7 +581,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) } dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), + (unsigned long long)inode->i_ino, nfs_display_fhandle_hash(fh), icount_read_once(inode)); @@ -1067,7 +1041,6 @@ out_no_revalidate: stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); - stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); stat->change_cookie = inode_peek_iversion_raw(inode); stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; if (server->change_attr_type != NFS4_CHANGE_TYPE_IS_UNDEFINED) @@ -1385,7 +1358,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) struct nfs_inode *nfsi = NFS_I(inode); dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n", - inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); + inode->i_sb->s_id, (unsigned long long)inode->i_ino); trace_nfs_revalidate_inode_enter(inode); @@ -1399,7 +1372,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = pnfs_sync_inode(inode, false); if (status) goto out; - } else if (nfs_have_directory_delegation(inode)) { + } else if (nfs_have_directory_delegation(inode) && 
+ !(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)) { status = 0; goto out; } @@ -1415,7 +1389,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n", inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), status); + (unsigned long long)inode->i_ino, status); switch (status) { case -ETIMEDOUT: /* A soft timeout occurred. Use cached information? */ @@ -1435,7 +1409,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (status) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n", inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), status); + (unsigned long long)inode->i_ino, status); goto out; } @@ -1446,7 +1420,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n", inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode)); + (unsigned long long)inode->i_ino); out: nfs_free_fattr(fattr); @@ -1495,7 +1469,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode)); + (unsigned long long)inode->i_ino); return 0; } @@ -1687,10 +1661,10 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) return 0; /* Has the inode gone and changed behind our back? */ - } else if (nfsi->fileid != fattr->fileid) { + } else if (inode->i_ino != fattr->fileid) { /* Is this perhaps the mounted-on fileid? 
*/ if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && - nfsi->fileid == fattr->mounted_on_fileid) + inode->i_ino == fattr->mounted_on_fileid) return 0; return -ESTALE; } @@ -2277,15 +2251,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) return 0; /* Has the inode gone and changed behind our back? */ - } else if (nfsi->fileid != fattr->fileid) { + } else if (inode->i_ino != fattr->fileid) { /* Is this perhaps the mounted-on fileid? */ if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && - nfsi->fileid == fattr->mounted_on_fileid) + inode->i_ino == fattr->mounted_on_fileid) return 0; printk(KERN_ERR "NFS: server %s error: fileid changed\n" "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", NFS_SERVER(inode)->nfs_client->cl_hostname, - inode->i_sb->s_id, (long long)nfsi->fileid, + inode->i_sb->s_id, (long long)inode->i_ino, (long long)fattr->fileid); goto out_err; } @@ -2813,7 +2787,7 @@ static void __exit exit_nfs_fs(void) MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); MODULE_DESCRIPTION("NFS client support"); MODULE_LICENSE("GPL"); -module_param(enable_ino64, bool, 0644); +module_param_cb(enable_ino64, &param_ops_enable_ino64, &enable_ino64, 0644); module_init(init_nfs_fs) module_exit(exit_nfs_fs) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ec2b3d984398..acaeff7ddfdf 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -535,6 +535,7 @@ extern void nfs_end_io_read(struct inode *inode); extern __must_check int nfs_start_io_write(struct inode *inode); extern void nfs_end_io_write(struct inode *inode); extern __must_check int nfs_start_io_direct(struct inode *inode); +extern __must_check int nfs_start_io_direct_nowait(struct inode *inode); extern void nfs_end_io_direct(struct inode *inode); static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 8337f0ae852d..2faf2003faf6 100644 --- a/fs/nfs/io.c +++ 
b/fs/nfs/io.c @@ -109,6 +109,16 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) } } +static int nfs_block_buffered_nowait(struct nfs_inode *nfsi, struct inode *inode) +{ + if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + if (inode->i_mapping->nrpages != 0) + return 1; + set_bit(NFS_INO_ODIRECT, &nfsi->flags); + } + return 0; +} + /** * nfs_start_io_direct - declare the file is being used for direct i/o * @inode: file inode @@ -150,6 +160,37 @@ nfs_start_io_direct(struct inode *inode) } /** + * nfs_start_io_direct_nowait - non-blocking variant of nfs_start_io_direct() + * @inode: file inode + * + * Try to declare that a direct I/O operation is about to start without + * blocking. + * Ensure all buffered I/O is blocked. + * If this could not be done without blocking then returns -EAGAIN. + */ +int +nfs_start_io_direct_nowait(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (!down_read_trylock(&inode->i_rwsem)) + return -EAGAIN; + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) + return 0; + up_read(&inode->i_rwsem); + + /* Slow path: try to flip NFS_INO_ODIRECT without blocking. 
*/ + if (!down_write_trylock(&inode->i_rwsem)) + return -EAGAIN; + if (nfs_block_buffered_nowait(nfsi, inode)) { + up_write(&inode->i_rwsem); + return -EAGAIN; + } + downgrade_write(&inode->i_rwsem); + return 0; +} + +/** * nfs_end_io_direct - declare that the direct i/o operation is done * @inode: file inode * diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 7602ede6f75f..ab86246fc364 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -81,12 +81,17 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, status = nfs4_call_sync(server->client, server, msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { - if (nfs_should_remove_suid(inode)) { - spin_lock(&inode->i_lock); + loff_t newsize = offset + len; + + spin_lock(&inode->i_lock); + if (newsize > i_size_read(inode)) + i_size_write(inode, newsize); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); + if (nfs_should_remove_suid(inode)) nfs_set_cache_invalid(inode, - NFS_INO_REVAL_FORCED | NFS_INO_INVALID_MODE); - spin_unlock(&inode->i_lock); - } + NFS_INO_REVAL_FORCED | + NFS_INO_INVALID_MODE); + spin_unlock(&inode->i_lock); status = nfs_post_op_update_inode_force_wcc(inode, res.falloc_fattr); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c48281db3868..1360409d8de9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -377,7 +377,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = htonl(attrs); /* bitmap */ *p++ = htonl(12); /* attribute buffer length */ *p++ = htonl(NF4DIR); - p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry))); + p = xdr_encode_hyper(p, d_inode(dentry)->i_ino); } *p++ = xdr_one; /* next */ @@ -391,7 +391,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = htonl(12); /* attribute buffer length */ *p++ = htonl(NF4DIR); spin_lock(&dentry->d_lock); - p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent))); + p = xdr_encode_hyper(p, 
d_inode(dentry->d_parent)->i_ino); spin_unlock(&dentry->d_lock); readdir->pgbase = (char *)p - (char *)start; @@ -5304,10 +5304,9 @@ static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, do { alias = _nfs4_proc_mkdir(dir, dentry, sattr, label, &err); trace_nfs4_mkdir(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); if (err) - alias = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), - err, - &exception)); + alias = ERR_PTR(err); } while (exception.retry); nfs4_label_release_security(label); @@ -7087,7 +7086,6 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) switch (task->tk_status) { case 0: renew_lease(calldata->server, calldata->timestamp); - locks_lock_inode_wait(calldata->lsp->ls_state->inode, &calldata->fl); if (nfs4_update_lock_stateid(calldata->lsp, &calldata->res.stateid)) break; @@ -7355,11 +7353,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) case 0: renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)), data->timestamp); - if (data->arg.new_lock && !data->cancelled) { - data->fl.c.flc_flags &= ~(FL_SLEEP | FL_ACCESS); - if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) - goto out_restart; - } if (data->arg.new_lock_owner != 0) { nfs_confirm_seqid(&lsp->ls_seqid, 0); nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid); @@ -7470,11 +7463,10 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f msg.rpc_argp = &data->arg; msg.rpc_resp = &data->res; task_setup_data.callback_data = data; - if (recovery_type > NFS_LOCK_NEW) { - if (recovery_type == NFS_LOCK_RECLAIM) - data->arg.reclaim = NFS_LOCK_RECLAIM; - } else - data->arg.new_lock = 1; + + if (recovery_type == NFS_LOCK_RECLAIM) + data->arg.reclaim = NFS_LOCK_RECLAIM; + task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -7584,6 +7576,13 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock 
up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); + if (status) + goto out; + + down_read(&nfsi->rwsem); + request->c.flc_flags &= ~(FL_SLEEP | FL_ACCESS); + status = locks_lock_inode_wait(state->inode, request); + up_read(&nfsi->rwsem); out: request->c.flc_flags = flags; return status; @@ -9991,6 +9990,38 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) case -NFS4ERR_GRACE: /* loca_recalim always false */ task->tk_status = 0; break; + case -NFS4ERR_OLD_STATEID: { + u32 old_seqid = be32_to_cpu(data->args.stateid.seqid); + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + if (nfs4_layout_refresh_old_stateid(&data->args.stateid, + &range, + data->args.inode)) { + struct pnfs_layout_hdr *lo; + + spin_lock(&data->args.inode->i_lock); + lo = NFS_I(data->args.inode)->layout; + if (lo && pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(&data->args.stateid, + &lo->plh_stateid)) + pnfs_set_layout_stateid(lo, &data->args.stateid, + NULL, false); + spin_unlock(&data->args.inode->i_lock); + + dprintk("%s: refreshed OLD_STATEID inode %llu seq %u->%u\n", + __func__, data->args.inode->i_ino, + old_seqid, + be32_to_cpu(data->args.stateid.seqid)); + + rpc_restart_call_prepare(task); + return; + } + fallthrough; + } case 0: break; default: diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index c939533b9881..1ed677810d9d 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -597,13 +597,13 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->openstateid_hash = 0; } if (inode != NULL) { - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); } else { __entry->fileid = 0; __entry->fhandle = 0; } - __entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent)); + __entry->dir = d_inode(ctx->dentry->d_parent)->i_ino; __assign_str(name); ), @@ -658,7 +658,7 @@ 
TRACE_EVENT(nfs4_cached_open, const struct inode *inode = state->inode; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->fmode = (__force unsigned int)state->state; __entry->stateid_seq = @@ -703,7 +703,7 @@ TRACE_EVENT(nfs4_close, const struct inode *inode = state->inode; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->fmode = (__force unsigned int)state->state; __entry->error = error < 0 ? -error : 0; @@ -759,7 +759,7 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, __entry->start = request->fl_start; __entry->end = request->fl_end; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->stateid_seq = be32_to_cpu(state->stateid.seqid); @@ -831,7 +831,7 @@ TRACE_EVENT(nfs4_set_lock, __entry->start = request->fl_start; __entry->end = request->fl_end; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->stateid_seq = be32_to_cpu(state->stateid.seqid); @@ -922,7 +922,7 @@ TRACE_EVENT(nfs4_state_lock_reclaim, const struct inode *inode = state->inode; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->state_flags = state->flags; __entry->lock_flags = lock->ls_flags; @@ -960,7 +960,7 @@ DECLARE_EVENT_CLASS(nfs4_set_delegation_event, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->fmode = (__force unsigned int)fmode; ), @@ -1087,7 +1087,7 @@ 
DECLARE_EVENT_CLASS(nfs4_test_stateid_event, __entry->error = error < 0 ? -error : 0; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->stateid_seq = be32_to_cpu(state->stateid.seqid); @@ -1137,7 +1137,7 @@ DECLARE_EVENT_CLASS(nfs4_lookup_event, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; - __entry->dir = NFS_FILEID(dir); + __entry->dir = dir->i_ino; __entry->error = -error; __assign_str(name); ), @@ -1185,7 +1185,7 @@ TRACE_EVENT(nfs4_lookupp, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; - __entry->ino = NFS_FILEID(inode); + __entry->ino = inode->i_ino; __entry->error = error < 0 ? -error : 0; ), @@ -1220,8 +1220,8 @@ TRACE_EVENT(nfs4_rename, TP_fast_assign( __entry->dev = olddir->i_sb->s_dev; - __entry->olddir = NFS_FILEID(olddir); - __entry->newdir = NFS_FILEID(newdir); + __entry->olddir = olddir->i_ino; + __entry->newdir = newdir->i_ino; __entry->error = error < 0 ? -error : 0; __assign_str(oldname); __assign_str(newname); @@ -1258,7 +1258,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_event, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->error = error < 0 ? -error : 0; ), @@ -1311,7 +1311,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->error = error < 0 ? -error : 0; __entry->stateid_seq = @@ -1421,7 +1421,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __entry->error = error < 0 ? 
-error : 0; __entry->fhandle = nfs_fhandle_hash(fhandle); if (!IS_ERR_OR_NULL(inode)) { - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->dev = inode->i_sb->s_dev; } else { __entry->fileid = 0; @@ -1478,7 +1478,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __entry->error = error < 0 ? -error : 0; __entry->fhandle = nfs_fhandle_hash(fhandle); if (!IS_ERR_OR_NULL(inode)) { - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->dev = inode->i_sb->s_dev; } else { __entry->fileid = 0; @@ -1655,7 +1655,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event, const struct pnfs_layout_segment *lseg = hdr->lseg; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(fh); __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; @@ -1727,7 +1727,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event, const struct pnfs_layout_segment *lseg = hdr->lseg; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(fh); __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; @@ -1795,7 +1795,7 @@ DECLARE_EVENT_CLASS(nfs4_commit_event, const struct pnfs_layout_segment *lseg = data->lseg; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(fh); __entry->offset = data->args.offset; __entry->count = data->args.count; @@ -1857,7 +1857,7 @@ TRACE_EVENT(nfs4_layoutget, const struct inode *inode = d_inode(ctx->dentry); const struct nfs4_state *state = ctx->state; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->iomode = args->iomode; __entry->offset = args->offset; @@ -1957,7 +1957,7 @@ TRACE_EVENT(pnfs_update_layout, ), TP_fast_assign( 
__entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->pos = pos; __entry->count = count; @@ -2012,7 +2012,7 @@ DECLARE_EVENT_CLASS(pnfs_layout_event, ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->pos = pos; __entry->count = count; @@ -2194,7 +2194,7 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, __entry->error = -error; __entry->nfs_error = hdr->res.op_status; __entry->fhandle = nfs_fhandle_hash(hdr->args.fh); - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->dev = inode->i_sb->s_dev; __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; @@ -2258,7 +2258,7 @@ TRACE_EVENT(ff_layout_commit_error, __entry->error = -error; __entry->nfs_error = data->res.op_status; __entry->fhandle = nfs_fhandle_hash(data->args.fh); - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->dev = inode->i_sb->s_dev; __entry->offset = data->args.offset; __entry->count = data->args.count; @@ -2423,7 +2423,7 @@ TRACE_EVENT(nfs4_llseek, TP_STRUCT__entry( __field(unsigned long, error) __field(u32, fhandle) - __field(u32, fileid) + __field(u64, fileid) __field(dev_t, dev) __field(int, stateid_seq) __field(u32, stateid_hash) @@ -2434,10 +2434,9 @@ TRACE_EVENT(nfs4_llseek, ), TP_fast_assign( - const struct nfs_inode *nfsi = NFS_I(inode); const struct nfs_fh *fh = args->sa_fh; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->dev = inode->i_sb->s_dev; __entry->fhandle = nfs_fhandle_hash(fh); __entry->offset_s = args->sa_offset; @@ -2499,7 +2498,7 @@ DECLARE_EVENT_CLASS(nfs4_sparse_event, __entry->offset = args->falloc_offset; __entry->len = args->falloc_length; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + 
__entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->stateid_seq = be32_to_cpu(args->falloc_stateid.seqid); @@ -2568,14 +2567,11 @@ TRACE_EVENT(nfs4_copy, ), TP_fast_assign( - const struct nfs_inode *src_nfsi = NFS_I(src_inode); - const struct nfs_inode *dst_nfsi = NFS_I(dst_inode); - - __entry->src_fileid = src_nfsi->fileid; + __entry->src_fileid = src_inode->i_ino; __entry->src_dev = src_inode->i_sb->s_dev; __entry->src_fhandle = nfs_fhandle_hash(args->src_fh); __entry->src_offset = args->src_pos; - __entry->dst_fileid = dst_nfsi->fileid; + __entry->dst_fileid = dst_inode->i_ino; __entry->dst_dev = dst_inode->i_sb->s_dev; __entry->dst_fhandle = nfs_fhandle_hash(args->dst_fh); __entry->dst_offset = args->dst_pos; @@ -2666,14 +2662,11 @@ TRACE_EVENT(nfs4_clone, ), TP_fast_assign( - const struct nfs_inode *src_nfsi = NFS_I(src_inode); - const struct nfs_inode *dst_nfsi = NFS_I(dst_inode); - - __entry->src_fileid = src_nfsi->fileid; + __entry->src_fileid = src_inode->i_ino; __entry->src_dev = src_inode->i_sb->s_dev; __entry->src_fhandle = nfs_fhandle_hash(args->src_fh); __entry->src_offset = args->src_offset; - __entry->dst_fileid = dst_nfsi->fileid; + __entry->dst_fileid = dst_inode->i_ino; __entry->dst_dev = dst_inode->i_sb->s_dev; __entry->dst_fhandle = nfs_fhandle_hash(args->dst_fh); __entry->dst_offset = args->dst_offset; @@ -2724,7 +2717,7 @@ TRACE_EVENT(nfs4_copy_notify, TP_STRUCT__entry( __field(unsigned long, error) __field(u32, fhandle) - __field(u32, fileid) + __field(u64, fileid) __field(dev_t, dev) __field(int, stateid_seq) __field(u32, stateid_hash) @@ -2733,9 +2726,7 @@ TRACE_EVENT(nfs4_copy_notify, ), TP_fast_assign( - const struct nfs_inode *nfsi = NFS_I(inode); - - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->dev = inode->i_sb->s_dev; __entry->fhandle = nfs_fhandle_hash(args->cna_src_fh); __entry->stateid_seq = @@ -2830,7 +2821,7 @@ DECLARE_EVENT_CLASS(nfs4_xattr_event, 
TP_fast_assign( __entry->error = error < 0 ? -error : 0; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __assign_str(name); ), diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index ff467959f733..4ada21f4eebd 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -80,7 +80,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event, TP_fast_assign( const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); __entry->cache_validity = nfsi->cache_validity; @@ -121,7 +121,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done, const struct nfs_inode *nfsi = NFS_I(inode); __entry->error = error < 0 ? -error : 0; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->type = nfs_umode_to_dtype(inode->i_mode); __entry->version = inode_peek_iversion_raw(inode); @@ -211,7 +211,7 @@ TRACE_EVENT(nfs_access_exit, const struct nfs_inode *nfsi = NFS_I(inode); __entry->error = error < 0 ? 
-error : 0; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->type = nfs_umode_to_dtype(inode->i_mode); __entry->version = inode_peek_iversion_raw(inode); @@ -265,7 +265,7 @@ DECLARE_EVENT_CLASS(nfs_update_size_class, __entry->dev = inode->i_sb->s_dev; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->version = inode_peek_iversion_raw(inode); __entry->cur_size = i_size_read(inode); __entry->new_size = new_size; @@ -317,7 +317,7 @@ DECLARE_EVENT_CLASS(nfs_inode_range_event, __entry->dev = inode->i_sb->s_dev; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->version = inode_peek_iversion_raw(inode); __entry->range_start = range_start; __entry->range_end = range_end; @@ -371,7 +371,7 @@ DECLARE_EVENT_CLASS(nfs_readdir_event, const struct nfs_inode *nfsi = NFS_I(dir); __entry->dev = dir->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = dir->i_ino; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(dir); if (cookie != 0) @@ -429,9 +429,9 @@ DECLARE_EVENT_CLASS(nfs_lookup_event, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; - __entry->dir = NFS_FILEID(dir); + __entry->dir = dir->i_ino; __entry->flags = flags; - __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry)); + __entry->fileid = d_is_negative(dentry) ? 0 : d_inode(dentry)->i_ino; __assign_str(name); ), @@ -476,10 +476,10 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; - __entry->dir = NFS_FILEID(dir); + __entry->dir = dir->i_ino; __entry->error = error < 0 ? -error : 0; __entry->flags = flags; - __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry)); + __entry->fileid = d_is_negative(dentry) ? 
0 : d_inode(dentry)->i_ino; __assign_str(name); ), @@ -532,7 +532,7 @@ TRACE_EVENT(nfs_atomic_open_enter, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; - __entry->dir = NFS_FILEID(dir); + __entry->dir = dir->i_ino; __entry->flags = flags; __entry->fmode = (__force unsigned long)ctx->mode; __assign_str(name); @@ -571,7 +571,7 @@ TRACE_EVENT(nfs_atomic_open_exit, TP_fast_assign( __entry->error = -error; __entry->dev = dir->i_sb->s_dev; - __entry->dir = NFS_FILEID(dir); + __entry->dir = dir->i_ino; __entry->flags = flags; __entry->fmode = (__force unsigned long)ctx->mode; __assign_str(name); @@ -608,7 +608,7 @@ TRACE_EVENT(nfs_create_enter, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; - __entry->dir = NFS_FILEID(dir); + __entry->dir = dir->i_ino; __entry->flags = flags; __assign_str(name); ), @@ -644,7 +644,7 @@ TRACE_EVENT(nfs_create_exit, TP_fast_assign( __entry->error = -error; __entry->dev = dir->i_sb->s_dev; - __entry->dir = NFS_FILEID(dir); + __entry->dir = dir->i_ino; __entry->flags = flags; __assign_str(name); ), @@ -676,7 +676,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; - __entry->dir = NFS_FILEID(dir); + __entry->dir = dir->i_ino; __assign_str(name); ), @@ -714,7 +714,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event_done, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; - __entry->dir = NFS_FILEID(dir); + __entry->dir = dir->i_ino; __entry->error = error < 0 ? -error : 0; __assign_str(name); ), @@ -768,8 +768,8 @@ TRACE_EVENT(nfs_link_enter, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); - __entry->dir = NFS_FILEID(dir); + __entry->fileid = inode->i_ino; + __entry->dir = dir->i_ino; __assign_str(name); ), @@ -803,8 +803,8 @@ TRACE_EVENT(nfs_link_exit, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); - __entry->dir = NFS_FILEID(dir); + __entry->fileid = inode->i_ino; + __entry->dir = dir->i_ino; __entry->error = error < 0 ? 
-error : 0; __assign_str(name); ), @@ -840,8 +840,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event, TP_fast_assign( __entry->dev = old_dir->i_sb->s_dev; - __entry->old_dir = NFS_FILEID(old_dir); - __entry->new_dir = NFS_FILEID(new_dir); + __entry->old_dir = old_dir->i_ino; + __entry->new_dir = new_dir->i_ino; __assign_str(old_name); __assign_str(new_name); ), @@ -889,8 +889,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done, TP_fast_assign( __entry->dev = old_dir->i_sb->s_dev; __entry->error = -error; - __entry->old_dir = NFS_FILEID(old_dir); - __entry->new_dir = NFS_FILEID(new_dir); + __entry->old_dir = old_dir->i_ino; + __entry->new_dir = new_dir->i_ino; __assign_str(old_name); __assign_str(new_name); ), @@ -943,7 +943,7 @@ TRACE_EVENT(nfs_sillyrename_unlink, struct inode *dir = d_inode(data->dentry->d_parent); size_t len = data->args.name.len; __entry->dev = dir->i_sb->s_dev; - __entry->dir = NFS_FILEID(dir); + __entry->dir = dir->i_ino; __entry->error = -error; memcpy(__get_str(name), data->args.name.name, len); @@ -981,7 +981,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event, const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); __entry->offset = offset; @@ -1031,7 +1031,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done, const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); __entry->offset = offset; @@ -1109,7 +1109,7 @@ DECLARE_EVENT_CLASS(nfs_kiocb_event, const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); 
__entry->offset = iocb->ki_pos; @@ -1160,7 +1160,7 @@ TRACE_EVENT(nfs_aop_readahead, const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); __entry->offset = pos; @@ -1199,7 +1199,7 @@ TRACE_EVENT(nfs_aop_readahead_done, const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); __entry->nr_pages = nr_pages; @@ -1239,7 +1239,7 @@ TRACE_EVENT(nfs_initiate_read, __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(fh); ), @@ -1284,7 +1284,7 @@ TRACE_EVENT(nfs_readpage_done, __entry->res_count = hdr->res.count; __entry->eof = hdr->res.eof; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(fh); ), @@ -1330,7 +1330,7 @@ TRACE_EVENT(nfs_readpage_short, __entry->res_count = hdr->res.count; __entry->eof = hdr->res.eof; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(fh); ), @@ -1377,7 +1377,7 @@ TRACE_EVENT(nfs_pgio_error, __entry->arg_count = hdr->args.count; __entry->res_count = hdr->res.count; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(fh); ), @@ -1416,7 +1416,7 @@ TRACE_EVENT(nfs_initiate_write, __entry->count = hdr->args.count; __entry->stable = hdr->args.stable; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; 
__entry->fhandle = nfs_fhandle_hash(fh); ), @@ -1467,7 +1467,7 @@ TRACE_EVENT(nfs_writeback_done, &verf->verifier, NFS4_VERIFIER_SIZE); __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(fh); ), @@ -1507,7 +1507,7 @@ DECLARE_EVENT_CLASS(nfs_page_class, const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->req = req; __entry->offset = req_offset(req); @@ -1555,7 +1555,7 @@ DECLARE_EVENT_CLASS(nfs_page_error_class, TP_fast_assign( const struct nfs_inode *nfsi = NFS_I(inode); __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->offset = req_offset(req); __entry->count = req->wb_bytes; @@ -1609,7 +1609,7 @@ TRACE_EVENT(nfs_initiate_commit, __entry->offset = data->args.offset; __entry->count = data->args.count; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(fh); ), @@ -1655,7 +1655,7 @@ TRACE_EVENT(nfs_commit_done, &verf->verifier, NFS4_VERIFIER_SIZE); __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(fh); ), @@ -1701,7 +1701,7 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class, const struct nfs_fh *fh = &nfsi->fh; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(fh); __entry->offset = dreq->io_start; __entry->count = dreq->count; @@ -1765,7 +1765,7 @@ DECLARE_EVENT_CLASS(nfs_local_dio_class, const struct nfs_fh *fh = &nfsi->fh; __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; + __entry->fileid = inode->i_ino; __entry->fhandle = nfs_fhandle_hash(fh); 
__entry->offset = offset; __entry->count = count; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 4a87b2fdb2e6..7dd478ffc2fa 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -759,7 +759,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, dprintk("NFS: initiated pgio call " "(req %s/%llu, %u bytes @ offset %llu)\n", hdr->inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(hdr->inode), + (unsigned long long)hdr->inode->i_ino, hdr->args.count, (unsigned long long)hdr->args.offset); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 743467e9ba20..7715e2bd5871 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1463,8 +1463,6 @@ _pnfs_return_layout(struct inode *ino) pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0); - if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) - NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range); /* Don't send a LAYOUTRETURN if list was initially empty */ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) || @@ -1476,6 +1474,8 @@ _pnfs_return_layout(struct inode *ino) send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL); spin_unlock(&ino->i_lock); + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) + NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range); if (send) status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, 0); @@ -2229,11 +2229,11 @@ lookup_again: dprintk("%s wait for layoutreturn\n", __func__); lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo)); if (!IS_ERR(lseg)) { - pnfs_put_layout_hdr(lo); dprintk("%s retrying\n", __func__); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); + pnfs_put_layout_hdr(lo); goto lookup_again; } trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, @@ -2373,7 +2373,7 @@ out: dprintk("%s: inode %s/%llu pNFS layout segment %s for " "(%s, offset: %llu, length: %llu)\n", __func__, ino->i_sb->s_id, - (unsigned long long)NFS_FILEID(ino), + 
(unsigned long long)ino->i_ino, IS_ERR_OR_NULL(lseg) ? "not found" : "found", iomode==IOMODE_RW ? "read/write" : "read-only", (unsigned long long)pos, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 12632a706da8..0ff43dbcb7cd 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -1075,14 +1075,14 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) /* r_netid */ nlen = xdr_stream_decode_string_dup(xdr, &netid, XDR_MAX_NETOBJ, gfp_flags); - if (unlikely(nlen < 0)) + if (unlikely(nlen <= 0)) goto out_err; /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */ /* port is ".ABC.DEF", 8 chars max */ rlen = xdr_stream_decode_string_dup(xdr, &buf, INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8, gfp_flags); - if (unlikely(rlen < 0)) + if (unlikely(rlen <= 0)) goto out_free_netid; /* replace port '.' with '-' */ diff --git a/fs/nfs/read.c b/fs/nfs/read.c index e1fe78d7b8d0..2b70bd2b934b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -132,10 +132,32 @@ static void nfs_readpage_release(struct nfs_page *req, int error) static void nfs_page_group_set_uptodate(struct nfs_page *req) { - if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) + bool uptodate = false; + + nfs_page_group_lock(req); + if (!test_bit(PG_READ_FAILED, &req->wb_head->wb_flags) && + nfs_page_group_sync_on_bit_locked(req, PG_UPTODATE)) + uptodate = true; + nfs_page_group_unlock(req); + + if (uptodate) folio_mark_uptodate(nfs_page_to_folio(req)); } +static void nfs_page_group_mark_read_failed(struct nfs_page *req) +{ + struct nfs_page *tmp; + + nfs_page_group_lock(req); + set_bit(PG_READ_FAILED, &req->wb_head->wb_flags); + tmp = req; + do { + clear_bit(PG_UPTODATE, &tmp->wb_flags); + tmp = tmp->wb_this_page; + } while (tmp != req); + nfs_page_group_unlock(req); +} + static void nfs_read_completion(struct nfs_pgio_header *hdr) { unsigned long bytes = 0; @@ -172,6 +194,7 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) if (bytes <= hdr->good_bytes) 
nfs_page_group_set_uptodate(req); else { + nfs_page_group_mark_read_failed(req); error = hdr->error; xchg(&nfs_req_openctx(req)->error, error); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 8f8a03a68d3d..cb19f1540d98 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -509,6 +509,10 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, default: break; } + if (clp->cl_xprtsec.cert_serial) + seq_puts(m, ",cert_serial=<redacted>"); + if (clp->cl_xprtsec.privkey_serial) + seq_puts(m, ",privkey_serial=<redacted>"); if (version != 4) nfs_show_mountd_options(m, nfss, showdefaults); diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 43ea897943c0..b57cfaa4d516 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -460,7 +460,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto out; - fileid = NFS_FILEID(d_inode(dentry)); + fileid = d_inode(dentry)->i_ino; sdentry = NULL; do { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d7c399763ad9..fcffb8c9e9df 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1817,7 +1817,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) dprintk("NFS: commit (%s/%llu %d@%lld)", nfs_req_openctx(req)->dentry->d_sb->s_id, - (unsigned long long)NFS_FILEID(d_inode(nfs_req_openctx(req)->dentry)), + (unsigned long long)d_inode(nfs_req_openctx(req)->dentry)->i_ino, req->wb_bytes, (long long)req_offset(req)); if (status < 0) { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4623262da3c0..ec17e602c979 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -146,11 +146,6 @@ struct nfs4_xattr_cache; */ struct nfs_inode { /* - * The 64bit 'inode number' - */ - __u64 fileid; - - /* * NFS file handle */ struct nfs_fh fh; @@ -394,16 +389,6 @@ static inline int NFS_STALE(const struct inode *inode) return test_bit(NFS_INO_STALE, &NFS_I(inode)->flags); } -static inline __u64 NFS_FILEID(const struct inode *inode) -{ 
- return NFS_I(inode)->fileid; -} - -static inline void set_nfs_fileid(struct inode *inode, __u64 fileid) -{ - NFS_I(inode)->fileid = fileid; -} - static inline void nfs_mark_for_revalidate(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); @@ -473,7 +458,6 @@ extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context extern void nfs_file_clear_open_context(struct file *flip); extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); -extern u64 nfs_compat_user_ino64(u64 fileid); extern void nfs_fattr_init(struct nfs_fattr *fattr); extern void nfs_fattr_set_barrier(struct nfs_fattr *fattr); extern unsigned long nfs_inc_attr_generation_counter(void); @@ -668,15 +652,6 @@ static inline loff_t nfs_size_to_loff_t(__u64 size) return min_t(u64, size, OFFSET_MAX); } -static inline ino_t -nfs_fileid_to_ino_t(u64 fileid) -{ - ino_t ino = (ino_t) fileid; - if (sizeof(ino_t) < sizeof(u64)) - ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8; - return ino; -} - static inline void nfs_ooo_clear(struct nfs_inode *nfsi) { nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER; diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index afe1d8f09d89..4b9a35dbc062 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -33,6 +33,7 @@ enum { PG_TEARDOWN, /* page group sync for destroy */ PG_UNLOCKPAGE, /* page group sync bit in read path */ PG_UPTODATE, /* page group sync bit in read path */ + PG_READ_FAILED, /* page group saw a read error */ PG_WB_END, /* page group sync bit in write path */ PG_REMOVE, /* page group sync bit in write path */ PG_CONTENDED1, /* Is someone waiting for a lock? 
*/ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 35ea18a40b66..11c5b31cfc7d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -582,7 +582,6 @@ struct nfs_lock_args { struct nfs_lowner lock_owner; unsigned char block : 1; unsigned char reclaim : 1; - unsigned char new_lock : 1; unsigned char new_lock_owner : 1; }; diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index a90480f80154..e638b92b7ad1 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -327,7 +327,7 @@ static ssize_t rpc_sysfs_xprt_switch_add_xprt_store(struct kobject *kobj, { struct rpc_xprt_switch *xprt_switch = rpc_sysfs_xprt_switch_kobj_get_xprt(kobj); - struct xprt_create xprt_create_args; + struct xprt_create xprt_create_args = {}; struct rpc_xprt *xprt, *new; if (!xprt_switch) @@ -348,7 +348,7 @@ static ssize_t rpc_sysfs_xprt_switch_add_xprt_store(struct kobject *kobj, xprt_create_args.reconnect_timeout = xprt->max_reconnect_timeout; new = xprt_create_transport(&xprt_create_args); - if (IS_ERR_OR_NULL(new)) { + if (IS_ERR(new)) { count = PTR_ERR(new); goto out_put_xprt; } diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 2f0f9618dd05..e5b3463da25f 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -159,9 +159,7 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) rpcrdma_rep_put(&r_xprt->rx_buf, rep); req->rl_reply = NULL; - spin_lock(&xprt->bc_pa_lock); - list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock(&xprt->bc_pa_lock); + rpcrdma_req_put(req); xprt_put(xprt); } @@ -203,6 +201,7 @@ create_req: rqst->rq_xprt = xprt; __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); xdr_buf_init(&rqst->rq_snd_buf, rdmab_data(req->rl_sendbuf), size); + kref_init(&req->rl_kref); return rqst; } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 7f79a0a2601e..e5c71cf705a3 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ 
b/net/sunrpc/xprtrdma/frwr_ops.c @@ -474,7 +474,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) ++num_wrs; } - if ((kref_read(&req->rl_kref) > 1) || num_wrs > ep->re_send_count) { + if (req->rl_sendctx->sc_unmap_count || num_wrs > ep->re_send_count) { send_wr->send_flags |= IB_SEND_SIGNALED; ep->re_send_count = min_t(unsigned int, ep->re_send_batch, num_wrs - ep->re_send_count); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 0e0f21974710..1285f04cdac1 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -467,29 +467,11 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, return 0; } -static void rpcrdma_sendctx_done(struct kref *kref) -{ - struct rpcrdma_req *req = - container_of(kref, struct rpcrdma_req, rl_kref); - struct rpcrdma_rep *rep = req->rl_reply; - - rpcrdma_complete_rqst(rep); - rep->rr_rxprt->rx_stats.reply_waits_for_send++; -} - -/** - * rpcrdma_sendctx_unmap - DMA-unmap Send buffer - * @sc: sendctx containing SGEs to unmap - * - */ -void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) +static void rpcrdma_sendctx_dma_unmap(struct rpcrdma_sendctx *sc) { struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf; struct ib_sge *sge; - if (!sc->sc_unmap_count) - return; - /* The first two SGEs contain the transport header and * the inline buffer. These are always left mapped so * they can be cheaply re-used. 
@@ -498,8 +480,33 @@ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) ++sge, --sc->sc_unmap_count) ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length, DMA_TO_DEVICE); +} + +/** + * rpcrdma_sendctx_unmap - DMA-unmap Send buffer and release Send owner + * @sc: sendctx containing SGEs to unmap + * + */ +void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc) +{ + struct rpcrdma_req *req = sc->sc_req; - kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done); + rpcrdma_sendctx_dma_unmap(sc); + sc->sc_req = NULL; + req->rl_sendctx = NULL; + rpcrdma_req_put(req); +} + +/* No Send was posted. Release DMA mappings prepared for this + * sendctx, but leave the request reference count alone. + */ +static void rpcrdma_sendctx_cancel(struct rpcrdma_sendctx *sc) +{ + struct rpcrdma_req *req = sc->sc_req; + + rpcrdma_sendctx_dma_unmap(sc); + sc->sc_req = NULL; + req->rl_sendctx = NULL; } /* Prepare an SGE for the RPC-over-RDMA transport header. @@ -691,8 +698,6 @@ static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt, tail->iov_len)) return false; - if (req->rl_sendctx->sc_unmap_count) - kref_get(&req->rl_kref); return true; } @@ -722,7 +727,6 @@ static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt, len -= len & 3; if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len)) return false; - kref_get(&req->rl_kref); } return true; @@ -743,6 +747,7 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) { + struct rpcrdma_sendctx *sc; int ret; ret = -EAGAIN; @@ -751,7 +756,6 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, goto out_nosc; req->rl_sendctx->sc_unmap_count = 0; req->rl_sendctx->sc_req = req; - kref_init(&req->rl_kref); req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe; req->rl_wr.sg_list = req->rl_sendctx->sc_sges; req->rl_wr.num_sge = 0; @@ -779,10 +783,16 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, goto out_unmap; } + /* The Send-side 
owner releases this reference when the + * Send has completed. + */ + kref_get(&req->rl_kref); return 0; out_unmap: - rpcrdma_sendctx_unmap(req->rl_sendctx); + sc = req->rl_sendctx; + rpcrdma_sendctx_cancel(sc); + rpcrdma_sendctx_unget_locked(r_xprt, sc); out_nosc: trace_xprtrdma_prepsend_failed(&req->rl_slot, ret); return ret; @@ -1081,6 +1091,8 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) /* Peek at stream contents without advancing. */ p = xdr_inline_decode(xdr, 0); + if ((char *)xdr->end - (char *)p < 5 * XDR_UNIT) + return false; /* Chunk lists */ if (xdr_item_is_present(p++)) @@ -1105,7 +1117,7 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) */ p = xdr_inline_decode(xdr, 3 * sizeof(*p)); if (unlikely(!p)) - return true; + return false; rpcrdma_bc_receive_call(r_xprt, rep); return true; @@ -1329,6 +1341,11 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) struct rpc_rqst *rqst = rep->rr_rqst; int status; + /* I3: rl_registered has been drained by frwr_unmap before + * complete_rqst runs. + */ + WARN_ON_ONCE(!list_empty(&rpcr_to_rdmar(rqst)->rl_registered)); + switch (rep->rr_proc) { case rdma_msg: status = rpcrdma_decode_msg(r_xprt, rep, rqst); @@ -1360,13 +1377,69 @@ out_badheader: goto out; } -static void rpcrdma_reply_done(struct kref *kref) -{ - struct rpcrdma_req *req = - container_of(kref, struct rpcrdma_req, rl_kref); - - rpcrdma_complete_rqst(req->rl_reply); -} +/* Reply-side ownership invariants + * + * I1 (Receive WR ownership). A struct rpcrdma_rep is owned by the + * HCA between ib_post_recv() and the matching Receive completion. + * After ib_dma_sync_single_for_cpu() in rpcrdma_wc_receive() it is + * owned by the CPU until rpcrdma_rep_put() returns it to + * rb_free_reps; a rep on rb_free_reps is not re-posted until + * rpcrdma_post_recvs() pulls it off. Asserted: rpcrdma_post_recvs() + * WARNs that a pulled rep has rr_rqst == NULL. + * + * I2 (rep attachment). 
While req->rl_reply == rep, the rep cannot be + * re-posted. rpcrdma_reply_put() NULLs req->rl_reply before handing + * the rep to rpcrdma_rep_put(). Asserted: rpcrdma_reply_put() WARNs + * that rl_reply is NULL after the put. + * + * I3 (Registered-MR fence). On entry to rpcrdma_complete_rqst() every + * MR that was on req->rl_registered has had its rkey invalidated + * (remotely via IB_WC_WITH_INVALIDATE or locally via IB_WR_LOCAL_INV) + * and its pages ib_dma_unmap_sg()'d. The LocalInv chain is posted + * on a single QP; strong send-queue ordering makes the last + * completion (frwr_wc_localinv_done) observe the + * ib_dma_unmap_sg() that ran from each earlier completion's + * frwr_mr_put() before complete_rqst is called. The inline + * frwr_reminv() path unmaps its one MR synchronously before + * rpcrdma_reply_handler() reaches complete_rqst. Asserted: + * rpcrdma_complete_rqst() WARNs that rl_registered is empty. + * + * I4 (Send-buffer release). req->rl_kref carries two unconditional + * owners while a Send is outstanding: the RPC-layer reference (set + * at xprt_rdma_alloc_slot / xprt_rdma_bc_rqst_get / rpcrdma_req_release + * pool-entry) and the Send-side reference (kref_get() in + * rpcrdma_prepare_send_sges()). rpcrdma_req_release() runs only + * after both have dropped, so the req does not return to its free + * pool until rpcrdma_sendctx_unmap() has fired -- the HCA has + * released the send buffer before the req can be reused. Asserted: + * rpcrdma_req_release() WARNs that rl_sendctx is NULL. + * + * I5 (req lifecycle). A req is owned by the RPC layer between slot + * acquisition and the matching xprt_rdma_free_slot() (or, for the + * backchannel, xprt_rdma_bc_free_rqst()). While owned, rl_kref >= 1. + * The pools (rb_send_bufs, bc_pa_list, backlog wake target) never + * contain a req with outstanding Send-side or Reply-side work. + * + * Non-hazards. 
The following claims have been raised by adversarial + * review and are each closed by the invariants above: + * + * * "Reply completes the RPC while the HCA still holds the send + * buffer" -- excluded by I4. The Send-side kref reference is held + * until rpcrdma_sendctx_unmap() runs from Send completion. + * + * * "Signal-driven release races the in-flight Send" -- same + * resolution. xprt_rdma_free() does not touch rl_kref; the + * Send-side reference keeps the req out of its pool until Send + * completion fires. + * + * * "Receive completion races rep reuse" -- excluded by I1. A rep + * is on rb_free_reps only after rpcrdma_rep_put() has been called + * and rpcrdma_post_recvs() owns the next transition back to the HCA. + * + * * "Pages still DMA-mapped when call_decode reads them" -- excluded + * by I3. The matching ib_dma_unmap_sg() for every MR has run on + * the same CPU thread that calls rpcrdma_complete_rqst(). + */ /** * rpcrdma_reply_handler - Process received RPC/RDMA messages @@ -1402,6 +1475,14 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) credits = be32_to_cpu(*p++); rep->rr_proc = *p++; + /* The credit grant from the wire is not trustworthy; + * sanitize it before any code path consumes it. 
+ */ + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > r_xprt->rx_ep->re_max_requests) + credits = r_xprt->rx_ep->re_max_requests; + if (rep->rr_vers != rpcrdma_version) goto out_badversion; @@ -1418,10 +1499,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) xprt_pin_rqst(rqst); spin_unlock(&xprt->queue_lock); - if (credits == 0) - credits = 1; /* don't deadlock */ - else if (credits > r_xprt->rx_ep->re_max_requests) - credits = r_xprt->rx_ep->re_max_requests; if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); @@ -1439,7 +1516,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) frwr_unmap_async(r_xprt, req); /* LocalInv completion will complete the RPC */ else - kref_put(&req->rl_kref, rpcrdma_reply_done); + rpcrdma_complete_rqst(rep); out_post: rpcrdma_post_recvs(r_xprt, @@ -1454,11 +1531,13 @@ out_norqst: out_badversion: trace_xprtrdma_reply_vers_err(rep); - goto out; + rpcrdma_rep_put(buf, rep); + credits = buf->rb_credits; + goto out_post; out_shortreply: trace_xprtrdma_reply_short_err(rep); - -out: rpcrdma_rep_put(buf, rep); + credits = buf->rb_credits; + goto out_post; } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 61706df5e485..d4e6746d8ecd 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -279,6 +279,13 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&r_xprt->rx_connect_worker); rpcrdma_xprt_disconnect(r_xprt); + + /* The disconnect's sendctx drain can return bc_prealloc reqs + * to bc_pa_list after xprt_destroy_backchannel() emptied it. 
+ */ +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + xprt_rdma_bc_destroy(xprt, 0); +#endif rpcrdma_buffer_destroy(&r_xprt->rx_buf); xprt_rdma_free_addresses(xprt); @@ -484,7 +491,52 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); } trace_xprtrdma_op_connect(r_xprt, delay); - queue_delayed_work(system_long_wq, &r_xprt->rx_connect_worker, delay); + queue_delayed_work(system_dfl_long_wq, &r_xprt->rx_connect_worker, + delay); +} + +/* rl_kref has two owners while a Send is outstanding: the rpc_rqst + * owner and the sendctx. Replies complete the RPC but do not drop + * either reference. The req returns to its free pool only after + * xprt_rdma_free_slot() or xprt_rdma_bc_free_rqst() has dropped the + * RPC-layer reference and rpcrdma_sendctx_unmap() has dropped the + * Send-side reference. + */ +static void rpcrdma_req_release(struct kref *kref) +{ + struct rpcrdma_req *req = + container_of(kref, struct rpcrdma_req, rl_kref); + struct rpc_rqst *rqst = &req->rl_slot; + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt; + + /* I4: both the RPC-layer and Send-side owners have dropped, + * so rpcrdma_sendctx_unmap() has cleared rl_sendctx. 
+ */ + WARN_ON_ONCE(req->rl_sendctx); + + kref_init(&req->rl_kref); + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + if (bc_prealloc(rqst)) { + spin_lock(&xprt->bc_pa_lock); + list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); + spin_unlock(&xprt->bc_pa_lock); + return; + } +#endif + + if (xprt_wake_up_backlog(xprt, rqst)) + return; + + r_xprt = rpcx_to_rdmax(xprt); + memset(rqst, 0, sizeof(*rqst)); + rpcrdma_buffer_put(&r_xprt->rx_buf, req); +} + +void rpcrdma_req_put(struct rpcrdma_req *req) +{ + kref_put(&req->rl_kref, rpcrdma_req_release); } /** @@ -505,6 +557,7 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) req = rpcrdma_buffer_get(&r_xprt->rx_buf); if (!req) goto out_sleep; + kref_init(&req->rl_kref); task->tk_rqstp = &req->rl_slot; task->tk_status = 0; return; @@ -520,6 +573,7 @@ out_sleep: if (req) { struct rpc_rqst *rqst = &req->rl_slot; + kref_init(&req->rl_kref); if (!xprt_wake_up_backlog(xprt, rqst)) { memset(rqst, 0, sizeof(*rqst)); rpcrdma_buffer_put(&r_xprt->rx_buf, req); @@ -540,10 +594,7 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst) container_of(xprt, struct rpcrdma_xprt, rx_xprt); rpcrdma_reply_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); - if (!xprt_wake_up_backlog(xprt, rqst)) { - memset(rqst, 0, sizeof(*rqst)); - rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); - } + rpcrdma_req_put(rpcr_to_rdmar(rqst)); } static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt, @@ -607,10 +658,10 @@ xprt_rdma_free(struct rpc_task *task) frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req); } - /* XXX: If the RPC is completing because of a signal and - * not because a reply was received, we ought to ensure - * that the Send completion has fired, so that memory - * involved with the Send is not still visible to the NIC. 
+ /* The Send-side rl_kref owner keeps req out of its free pool + * until rpcrdma_sendctx_unmap() has fired -- see I4 above + * rpcrdma_reply_handler() -- so signal-driven release here + * does not let the HCA touch a recycled send buffer. */ } @@ -716,7 +767,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.mrs_allocated, r_xprt->rx_stats.local_inv_needed, r_xprt->rx_stats.empty_sendctx_q, - r_xprt->rx_stats.reply_waits_for_send); + 0LU); /* was reply_waits_for_send; column preserved */ } static int diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index aecf9c0a153f..04b286223b24 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -65,6 +65,8 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt); +static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf, + unsigned long item); static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, struct rpcrdma_sendctx *sc); static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt); @@ -79,6 +81,8 @@ rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction, int node); static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction); +static bool rpcrdma_regbuf_realloc_node(struct rpcrdma_regbuf *rb, + size_t size, gfp_t flags, int node); static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); @@ -243,8 +247,17 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) complete(&ep->re_done); return 0; case RDMA_CM_EVENT_ADDR_CHANGE: - ep->re_connect_status = -ENODEV; - goto disconnected; + switch (xchg(&ep->re_connect_status, -ENODEV)) { + case 0: + goto wake_connect_worker; + case 1: + /* The later DISCONNECTED event balances the + * ESTABLISHED get; do not put here. 
+ */ + rpcrdma_force_disconnect(ep); + return 0; + } + return 0; case RDMA_CM_EVENT_ESTABLISHED: rpcrdma_ep_get(ep); ep->re_connect_status = 1; @@ -267,7 +280,6 @@ wake_connect_worker: return 0; case RDMA_CM_EVENT_DISCONNECTED: ep->re_connect_status = -ECONNABORTED; -disconnected: rpcrdma_force_disconnect(ep); return rpcrdma_ep_put(ep); default: @@ -324,6 +336,7 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, if (rc) goto out; + ep->re_id = id; rc = rpcrdma_rn_register(id->device, &ep->re_rn, rpcrdma_ep_removal_done); if (rc) goto out; @@ -396,7 +409,6 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) } __module_get(THIS_MODULE); device = id->device; - ep->re_id = id; reinit_completion(&ep->re_done); ep->re_max_requests = r_xprt->rx_xprt.max_reqs; @@ -539,7 +551,17 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) goto out; } rpcrdma_mrs_create(r_xprt); - frwr_wp_create(r_xprt); + + /* + * rpcrdma_encode_write_list() dereferences the write-pad + * MR with no NULL check, so fail the connect rather than + * publish a transport whose write-pad MR is NULL. + */ + rc = frwr_wp_create(r_xprt); + if (rc) { + rc = -ENOTCONN; + goto out; + } out: trace_xprtrdma_connect(r_xprt, rc); @@ -571,9 +593,9 @@ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) rpcrdma_xprt_drain(r_xprt); rpcrdma_reps_unmap(r_xprt); + rpcrdma_sendctxs_destroy(r_xprt); rpcrdma_reqs_reset(r_xprt); rpcrdma_mrs_destroy(r_xprt); - rpcrdma_sendctxs_destroy(r_xprt); if (rpcrdma_ep_put(ep)) rdma_destroy_id(id); @@ -605,6 +627,25 @@ static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt) if (!buf->rb_sc_ctxs) return; + + /* The QP is drained, but the final unsignaled Sends might not + * have been walked by a signaled Send completion. Release those + * Send owners before request buffers are reset. 
+ * + * Unlike the completion sweep, this walk can visit slots with + * no Send posted: after a partial rpcrdma_sendctxs_create() + * failure on reconnect, rb_sc_head and rb_sc_tail are stale, + * and slots between them can be NULL or have sc_req clear. + */ + for (i = rpcrdma_sendctx_next(buf, buf->rb_sc_tail); + i != rpcrdma_sendctx_next(buf, buf->rb_sc_head); + i = rpcrdma_sendctx_next(buf, i)) { + struct rpcrdma_sendctx *sc = buf->rb_sc_ctxs[i]; + + if (sc && sc->sc_req) + rpcrdma_sendctx_unmap(sc); + } + for (i = 0; i <= buf->rb_sc_last; i++) kfree(buf->rb_sc_ctxs[i]); kfree(buf->rb_sc_ctxs); @@ -667,6 +708,12 @@ static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf, return likely(item < buf->rb_sc_last) ? item + 1 : 0; } +static unsigned long rpcrdma_sendctx_prev(struct rpcrdma_buffer *buf, + unsigned long item) +{ + return item > 0 ? item - 1 : buf->rb_sc_last; +} + /** * rpcrdma_sendctx_get_locked - Acquire a send context * @r_xprt: controlling transport instance @@ -724,6 +771,29 @@ out_emptyq: } /** + * rpcrdma_sendctx_unget_locked - Release an unposted send context + * @r_xprt: controlling transport instance + * @sc: send context to release + * + * Usage: Called when no Send is posted for the sendctx most + * recently returned by rpcrdma_sendctx_get_locked(). + * + * The caller serializes calls to this function and to + * rpcrdma_sendctx_get_locked() (per transport). 
+ */ +void rpcrdma_sendctx_unget_locked(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_sendctx *sc) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + + if (WARN_ON_ONCE(buf->rb_sc_ctxs[buf->rb_sc_head] != sc)) + return; + + buf->rb_sc_head = rpcrdma_sendctx_prev(buf, buf->rb_sc_head); + xprt_write_space(&r_xprt->rx_xprt); +} + +/** * rpcrdma_sendctx_put_locked - Release a send context * @r_xprt: controlling transport instance * @sc: send context to release @@ -739,15 +809,18 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, struct rpcrdma_buffer *buf = &r_xprt->rx_buf; unsigned long next_tail; - /* Unmap SGEs of previously completed but unsignaled - * Sends by walking up the queue until @sc is found. + /* Release previously completed but unsignaled Sends by walking + * up the queue until @sc is found. */ next_tail = buf->rb_sc_tail; do { + struct rpcrdma_sendctx *cur; + next_tail = rpcrdma_sendctx_next(buf, next_tail); /* ORDER: item must be accessed _before_ tail is updated */ - rpcrdma_sendctx_unmap(buf->rb_sc_ctxs[next_tail]); + cur = buf->rb_sc_ctxs[next_tail]; + rpcrdma_sendctx_unmap(cur); } while (buf->rb_sc_ctxs[next_tail] != sc); @@ -1022,9 +1095,15 @@ static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) * @buf: buffer pool * @rep: rep to release * + * The rep's transient association with an rpc_rqst, established + * by rpcrdma_reply_handler() and torn down here, must not survive + * onto rb_free_reps: rpcrdma_post_recvs() pulls reps from the free + * list to re-post them, and a non-NULL rr_rqst on a free-listed rep + * would imply the rep is still referenced by a req. 
*/ void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep) { + rep->rr_rqst = NULL; llist_add(&rep->rr_node, &buf->rb_free_reps); } @@ -1059,6 +1138,22 @@ static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) spin_unlock(&buf->rb_lock); } +static unsigned int rpcrdma_req_pool_slack(unsigned int max_reqs) +{ + /* The sendctx ring can hold up to one Send-signaling batch + * (re_send_batch, set by frwr_open() to re_max_requests >> 3) + * of unfinished Sends. Each pins its req until a signaled Send + * completion releases the sendctx. Size the pool above max_reqs + * by that batch so the recycle delay does not stall a slot + * allocation that the RPC/RDMA credit window would admit. + * + * Round up: re_max_requests >> 3 is zero when max_reqs < 8, but + * a single unsignaled Send is still enough to pin one req. One + * slack slot covers that case. + */ + return DIV_ROUND_UP(max_reqs, 8); +} + /** * rpcrdma_buffer_create - Create initial set of req/rep objects * @r_xprt: transport instance to (re)initialize @@ -1068,6 +1163,7 @@ static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + unsigned int max_reqs; int i, rc; buf->rb_bc_srv_max_requests = 0; @@ -1076,19 +1172,21 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) INIT_LIST_HEAD(&buf->rb_all_mrs); INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); - INIT_LIST_HEAD(&buf->rb_send_bufs); + init_llist_head(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); INIT_LIST_HEAD(&buf->rb_all_reps); rc = -ENOMEM; - for (i = 0; i < r_xprt->rx_xprt.max_reqs; i++) { + max_reqs = r_xprt->rx_xprt.max_reqs; + max_reqs += rpcrdma_req_pool_slack(max_reqs); + for (i = 0; i < max_reqs; i++) { struct rpcrdma_req *req; req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2); if (!req) goto out; - list_add(&req->rl_list, &buf->rb_send_bufs); + llist_add(&req->rl_node, 
&buf->rb_send_bufs); } init_llist_head(&buf->rb_free_reps); @@ -1168,16 +1266,14 @@ static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - rpcrdma_reps_destroy(buf); + struct rpcrdma_req *req, *next; + struct llist_node *node; - while (!list_empty(&buf->rb_send_bufs)) { - struct rpcrdma_req *req; + rpcrdma_reps_destroy(buf); - req = list_first_entry(&buf->rb_send_bufs, - struct rpcrdma_req, rl_list); - list_del(&req->rl_list); + node = llist_del_all(&buf->rb_send_bufs); + llist_for_each_entry_safe(req, next, node, rl_node) rpcrdma_req_destroy(req); - } } /** @@ -1207,9 +1303,11 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) */ void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) { - if (req->rl_reply) { - rpcrdma_rep_put(buffers, req->rl_reply); + struct rpcrdma_rep *rep = req->rl_reply; + + if (rep) { req->rl_reply = NULL; + rpcrdma_rep_put(buffers, rep); } } @@ -1222,15 +1320,15 @@ void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { - struct rpcrdma_req *req; + struct llist_node *node; + /* Calls to llist_del_first are required to be serialized */ spin_lock(&buffers->rb_lock); - req = list_first_entry_or_null(&buffers->rb_send_bufs, - struct rpcrdma_req, rl_list); - if (req) - list_del_init(&req->rl_list); + node = llist_del_first(&buffers->rb_send_bufs); spin_unlock(&buffers->rb_lock); - return req; + if (!node) + return NULL; + return llist_entry(node, struct rpcrdma_req, rl_node); } /** @@ -1243,9 +1341,7 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) { rpcrdma_reply_put(buffers, req); - spin_lock(&buffers->rb_lock); - list_add(&req->rl_list, &buffers->rb_send_bufs); - spin_unlock(&buffers->rb_lock); + llist_add(&req->rl_node, &buffers->rb_send_bufs); } /* Returns a pointer to a rpcrdma_regbuf object, or NULL. 
@@ -1292,9 +1388,15 @@ rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) */ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) { + return rpcrdma_regbuf_realloc_node(rb, size, flags, NUMA_NO_NODE); +} + +static bool rpcrdma_regbuf_realloc_node(struct rpcrdma_regbuf *rb, + size_t size, gfp_t flags, int node) +{ void *buf; - buf = kmalloc(size, flags); + buf = kmalloc_node(size, flags, node); if (!buf) return false; @@ -1306,6 +1408,23 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) return true; } +static bool rpcrdma_rep_resize(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_rep *rep) +{ + struct rpcrdma_regbuf *rb = rep->rr_rdmabuf; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + size_t size = ep->re_inline_recv; + + if (likely(rdmab_length(rb) >= size)) + return true; + if (!rpcrdma_regbuf_realloc_node(rb, size, XPRTRDMA_GFP_FLAGS, + ibdev_to_node(ep->re_id->device))) + return false; + + xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rb), rdmab_length(rb)); + return true; +} + /** * __rpcrdma_regbuf_dma_map - DMA-map a regbuf * @r_xprt: controlling transport instance @@ -1387,6 +1506,12 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) rep = rpcrdma_rep_create(r_xprt); if (!rep) break; + /* I1: a rep on rb_free_reps must carry no rqst pointer. 
*/ + WARN_ON_ONCE(rep->rr_rqst); + if (!rpcrdma_rep_resize(r_xprt, rep)) { + rpcrdma_rep_put(buf, rep); + break; + } if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) { rpcrdma_rep_put(buf, rep); break; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index f53a77472724..4cbc941e4a3e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -332,7 +332,7 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { - struct list_head rl_list; + struct llist_node rl_node; struct rpc_rqst rl_slot; struct rpcrdma_rep *rl_reply; struct xdr_stream rl_stream; @@ -374,14 +374,14 @@ rpcrdma_mr_pop(struct list_head *list) } /* - * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for - * inline requests/replies, and client/server credits. + * struct rpcrdma_buffer -- holds pre-registered memory for inline + * requests/replies, and client/server credits. * * One of these is associated with a transport instance */ struct rpcrdma_buffer { spinlock_t rb_lock; - struct list_head rb_send_bufs; + struct llist_head rb_send_bufs; struct list_head rb_mrs; unsigned long rb_sc_head; @@ -427,7 +427,6 @@ struct rpcrdma_stats { /* accessed when receiving a reply */ unsigned long long total_rdma_reply; unsigned long long fixup_copy_count; - unsigned long reply_waits_for_send; unsigned long local_inv_needed; unsigned long nomsg_call_count; unsigned long bcall_count; @@ -496,6 +495,8 @@ void rpcrdma_req_destroy(struct rpcrdma_req *req); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); +void rpcrdma_sendctx_unget_locked(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_sendctx *sc); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt); @@ -505,6 +506,7 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req 
*req); void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep); void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req); +void rpcrdma_req_put(struct rpcrdma_req *req); bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags); |
