diff options
Diffstat (limited to 'fs')
156 files changed, 1740 insertions, 1605 deletions
diff --git a/fs/affs/namei.c b/fs/affs/namei.c index d8aa0ae3d037..41c5749f4db7 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -201,14 +201,16 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) struct super_block *sb = dir->i_sb; struct buffer_head *bh; struct inode *inode = NULL; + struct dentry *res; pr_debug("%s(\"%pd\")\n", __func__, dentry); affs_lock_dir(dir); bh = affs_find_entry(dir, dentry); - affs_unlock_dir(dir); - if (IS_ERR(bh)) + if (IS_ERR(bh)) { + affs_unlock_dir(dir); return ERR_CAST(bh); + } if (bh) { u32 ino = bh->b_blocknr; @@ -222,11 +224,12 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } affs_brelse(bh); inode = affs_iget(sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); } - d_add(dentry, inode); - return NULL; + res = d_splice_alias(inode, dentry); + if (!IS_ERR_OR_NULL(res)) + res->d_fsdata = dentry->d_fsdata; + affs_unlock_dir(dir); + return res; } int diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index 3bedfed608a2..7587fb665ff1 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -121,7 +121,7 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, p = text; do { struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs]; - char tdelim = delim; + const char *q, *stop; if (*p == delim) { p++; @@ -130,28 +130,33 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, if (*p == '[') { p++; - tdelim = ']'; + q = memchr(p, ']', end - p); + } else { + for (q = p; q < end; q++) + if (*q == '+' || *q == delim) + break; } - if (in4_pton(p, end - p, + if (in4_pton(p, q - p, (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3], - tdelim, &p)) { + -1, &stop)) { srx->transport.sin6.sin6_addr.s6_addr32[0] = 0; srx->transport.sin6.sin6_addr.s6_addr32[1] = 0; srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); - } else if (in6_pton(p, end - p, + } else if (in6_pton(p, q - p, srx->transport.sin6.sin6_addr.s6_addr, - tdelim, &p)) { + -1, &stop)) { /* Nothing to do */ } else { goto bad_address; } - if (tdelim == ']') { - if (p == end || *p != ']') - goto bad_address; + if (stop != q) + goto bad_address; + + p = q; + if (q < end && *q == ']') p++; - } if (p < end) { if (*p == '+') { diff --git a/fs/afs/callback.c b/fs/afs/callback.c index abd9a84f4e88..571437dcb252 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -23,36 +23,55 @@ /* * Set up an interest-in-callbacks record for a volume on a server and * register it with the server. - * - Called with volume->server_sem held. + * - Called with vnode->io_lock held. */ int afs_register_server_cb_interest(struct afs_vnode *vnode, - struct afs_server_entry *entry) + struct afs_server_list *slist, + unsigned int index) { - struct afs_cb_interest *cbi = entry->cb_interest, *vcbi, *new, *x; + struct afs_server_entry *entry = &slist->servers[index]; + struct afs_cb_interest *cbi, *vcbi, *new, *old; struct afs_server *server = entry->server; again: + if (vnode->cb_interest && + likely(vnode->cb_interest == entry->cb_interest)) + return 0; + + read_lock(&slist->lock); + cbi = afs_get_cb_interest(entry->cb_interest); + read_unlock(&slist->lock); + vcbi = vnode->cb_interest; if (vcbi) { - if (vcbi == cbi) + if (vcbi == cbi) { + afs_put_cb_interest(afs_v2net(vnode), cbi); return 0; + } + /* Use a new interest in the server list for the same server + * rather than an old one that's still attached to a vnode. + */ if (cbi && vcbi->server == cbi->server) { write_seqlock(&vnode->cb_lock); - vnode->cb_interest = afs_get_cb_interest(cbi); + old = vnode->cb_interest; + vnode->cb_interest = cbi; write_sequnlock(&vnode->cb_lock); - afs_put_cb_interest(afs_v2net(vnode), cbi); + afs_put_cb_interest(afs_v2net(vnode), old); return 0; } + /* Re-use the one attached to the vnode. */ if (!cbi && vcbi->server == server) { - afs_get_cb_interest(vcbi); - x = cmpxchg(&entry->cb_interest, cbi, vcbi); - if (x != cbi) { - cbi = x; - afs_put_cb_interest(afs_v2net(vnode), vcbi); + write_lock(&slist->lock); + if (entry->cb_interest) { + write_unlock(&slist->lock); + afs_put_cb_interest(afs_v2net(vnode), cbi); goto again; } + + entry->cb_interest = cbi; + write_unlock(&slist->lock); return 0; } } @@ -72,13 +91,16 @@ again: list_add_tail(&new->cb_link, &server->cb_interests); write_unlock(&server->cb_break_lock); - x = cmpxchg(&entry->cb_interest, cbi, new); - if (x == cbi) { + write_lock(&slist->lock); + if (!entry->cb_interest) { + entry->cb_interest = afs_get_cb_interest(new); cbi = new; + new = NULL; } else { - cbi = x; - afs_put_cb_interest(afs_v2net(vnode), new); + cbi = afs_get_cb_interest(entry->cb_interest); } + write_unlock(&slist->lock); + afs_put_cb_interest(afs_v2net(vnode), new); } ASSERT(cbi); @@ -88,11 +110,14 @@ again: */ write_seqlock(&vnode->cb_lock); - vnode->cb_interest = afs_get_cb_interest(cbi); + old = vnode->cb_interest; + vnode->cb_interest = cbi; vnode->cb_s_break = cbi->server->cb_s_break; + vnode->cb_v_break = vnode->volume->cb_v_break; clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); write_sequnlock(&vnode->cb_lock); + afs_put_cb_interest(afs_v2net(vnode), old); return 0; } @@ -171,13 +196,24 @@ static void afs_break_one_callback(struct afs_server *server, if (cbi->vid != fid->vid) continue; - data.volume = NULL; - data.fid = *fid; - inode = ilookup5_nowait(cbi->sb, fid->vnode, afs_iget5_test, &data); - if (inode) { - vnode = AFS_FS_I(inode); - afs_break_callback(vnode); - iput(inode); + if (fid->vnode == 0 && fid->unique == 0) { + /* The callback break applies to an entire volume. */ + struct afs_super_info *as = AFS_FS_S(cbi->sb); + struct afs_volume *volume = as->volume; + + write_lock(&volume->cb_break_lock); + volume->cb_v_break++; + write_unlock(&volume->cb_break_lock); + } else { + data.volume = NULL; + data.fid = *fid; + inode = ilookup5_nowait(cbi->sb, fid->vnode, + afs_iget5_test, &data); + if (inode) { + vnode = AFS_FS_I(inode); + afs_break_callback(vnode); + iput(inode); + } } } @@ -195,6 +231,8 @@ void afs_break_callbacks(struct afs_server *server, size_t count, ASSERT(server != NULL); ASSERTCMP(count, <=, AFSCBMAX); + /* TODO: Sort the callback break list by volume ID */ + for (; count > 0; callbacks++, count--) { _debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }", callbacks->fid.vid, diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 357de908df3a..c332c95a6940 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -133,21 +133,10 @@ bool afs_cm_incoming_call(struct afs_call *call) } /* - * clean up a cache manager call + * Clean up a cache manager call. */ static void afs_cm_destructor(struct afs_call *call) { - _enter(""); - - /* Break the callbacks here so that we do it after the final ACK is - * received. The step number here must match the final number in - * afs_deliver_cb_callback(). - */ - if (call->unmarshall == 5) { - ASSERT(call->cm_server && call->count && call->request); - afs_break_callbacks(call->cm_server, call->count, call->request); - } - kfree(call->buffer); call->buffer = NULL; } @@ -161,14 +150,14 @@ static void SRXAFSCB_CallBack(struct work_struct *work) _enter(""); - /* be sure to send the reply *before* attempting to spam the AFS server - * with FSFetchStatus requests on the vnodes with broken callbacks lest - * the AFS server get into a vicious cycle of trying to break further - * callbacks because it hadn't received completion of the CBCallBack op - * yet */ - afs_send_empty_reply(call); + /* We need to break the callbacks before sending the reply as the + * server holds up change visibility till it receives our reply so as + * to maintain cache coherency. + */ + if (call->cm_server) + afs_break_callbacks(call->cm_server, call->count, call->request); - afs_break_callbacks(call->cm_server, call->count, call->request); + afs_send_empty_reply(call); afs_put_call(call); _leave(""); } @@ -180,7 +169,6 @@ static int afs_deliver_cb_callback(struct afs_call *call) { struct afs_callback_break *cb; struct sockaddr_rxrpc srx; - struct afs_server *server; __be32 *bp; int ret, loop; @@ -267,15 +255,6 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->offset = 0; call->unmarshall++; - - /* Record that the message was unmarshalled successfully so - * that the call destructor can know do the callback breaking - * work, even if the final ACK isn't received. - * - * If the step number changes, then afs_cm_destructor() must be - * updated also. - */ - call->unmarshall++; case 5: break; } @@ -286,10 +265,9 @@ static int afs_deliver_cb_callback(struct afs_call *call) /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); - server = afs_find_server(call->net, &srx); - if (!server) - return -ENOTCONN; - call->cm_server = server; + call->cm_server = afs_find_server(call->net, &srx); + if (!call->cm_server) + trace_afs_cm_no_server(call, &srx); return afs_queue_call_work(call); } @@ -303,7 +281,8 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) _enter("{%p}", call->cm_server); - afs_init_callback_state(call->cm_server); + if (call->cm_server) + afs_init_callback_state(call->cm_server); afs_send_empty_reply(call); afs_put_call(call); _leave(""); @@ -315,7 +294,6 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { struct sockaddr_rxrpc srx; - struct afs_server *server; int ret; _enter(""); @@ -328,10 +306,9 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - server = afs_find_server(call->net, &srx); - if (!server) - return -ENOTCONN; - call->cm_server = server; + call->cm_server = afs_find_server(call->net, &srx); + if (!call->cm_server) + trace_afs_cm_no_server(call, &srx); return afs_queue_call_work(call); } @@ -341,8 +318,6 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) */ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) { - struct sockaddr_rxrpc srx; - struct afs_server *server; struct afs_uuid *r; unsigned loop; __be32 *b; @@ -398,11 +373,11 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); - server = afs_find_server(call->net, &srx); - if (!server) - return -ENOTCONN; - call->cm_server = server; + rcu_read_lock(); + call->cm_server = afs_find_server_by_uuid(call->net, call->request); + rcu_read_unlock(); + if (!call->cm_server) + trace_afs_cm_no_server_u(call, call->request); return afs_queue_call_work(call); } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 5889f70d4d27..7d623008157f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -180,6 +180,7 @@ static int afs_dir_open(struct inode *inode, struct file *file) * get reclaimed during the iteration. */ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) + __acquires(&dvnode->validate_lock) { struct afs_read *req; loff_t i_size; @@ -261,18 +262,21 @@ retry: /* If we're going to reload, we need to lock all the pages to prevent * races. */ - if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { - ret = -ERESTARTSYS; - for (i = 0; i < req->nr_pages; i++) - if (lock_page_killable(req->pages[i]) < 0) - goto error_unlock; + ret = -ERESTARTSYS; + if (down_read_killable(&dvnode->validate_lock) < 0) + goto error; - if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - goto success; + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + goto success; + + up_read(&dvnode->validate_lock); + if (down_write_killable(&dvnode->validate_lock) < 0) + goto error; + if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { ret = afs_fetch_data(dvnode, key, req); if (ret < 0) - goto error_unlock_all; + goto error_unlock; task_io_account_read(PAGE_SIZE * req->nr_pages); @@ -284,33 +288,26 @@ retry: for (i = 0; i < req->nr_pages; i++) if (!afs_dir_check_page(dvnode, req->pages[i], req->actual_len)) - goto error_unlock_all; + goto error_unlock; // TODO: Trim excess pages set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); } + downgrade_write(&dvnode->validate_lock); success: - i = req->nr_pages; - while (i > 0) - unlock_page(req->pages[--i]); return req; -error_unlock_all: - i = req->nr_pages; error_unlock: - while (i > 0) - unlock_page(req->pages[--i]); + up_write(&dvnode->validate_lock); error: afs_put_read(req); _leave(" = %d", ret); return ERR_PTR(ret); content_has_grown: - i = req->nr_pages; - while (i > 0) - unlock_page(req->pages[--i]); + up_write(&dvnode->validate_lock); afs_put_read(req); goto retry; } @@ -473,6 +470,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, } out: + up_read(&dvnode->validate_lock); afs_put_read(req); _leave(" = %d", ret); return ret; @@ -1143,7 +1141,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(dvnode); afs_fs_create(&fc, dentry->d_name.name, mode, data_version, &newfid, &newstatus, &newcb); } @@ -1213,7 +1211,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(dvnode); afs_fs_remove(&fc, dentry->d_name.name, true, data_version); } @@ -1316,7 +1314,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(dvnode); afs_fs_remove(&fc, dentry->d_name.name, false, data_version); } @@ -1373,7 +1371,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(dvnode); afs_fs_create(&fc, dentry->d_name.name, mode, data_version, &newfid, &newstatus, &newcb); } @@ -1443,8 +1441,8 @@ static int afs_link(struct dentry *from, struct inode *dir, } while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - fc.cb_break_2 = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(dvnode); + fc.cb_break_2 = afs_calc_vnode_cb_break(vnode); afs_fs_link(&fc, vnode, dentry->d_name.name, data_version); } @@ -1512,7 +1510,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(dvnode); afs_fs_symlink(&fc, dentry->d_name.name, content, data_version, &newfid, &newstatus); @@ -1588,8 +1586,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, } } while (afs_select_fileserver(&fc)) { - fc.cb_break = orig_dvnode->cb_break + orig_dvnode->cb_s_break; - fc.cb_break_2 = new_dvnode->cb_break + new_dvnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(orig_dvnode); + fc.cb_break_2 = afs_calc_vnode_cb_break(new_dvnode); afs_fs_rename(&fc, old_dentry->d_name.name, new_dvnode, new_dentry->d_name.name, orig_data_version, new_data_version); diff --git a/fs/afs/file.c b/fs/afs/file.c index c24c08016dd9..7d4f26198573 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -238,7 +238,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_fetch_data(&fc, desc); } diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 7a0e017070ec..dc62d15a964b 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -86,7 +86,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key, ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_set_lock(&fc, type); } @@ -117,7 +117,7 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_current_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_extend_lock(&fc); } @@ -148,7 +148,7 @@ static int afs_release_lock(struct afs_vnode *vnode, struct key *key) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_current_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_release_lock(&fc); } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index efacdb7c1dee..b273e1d60478 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -134,6 +134,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, struct afs_read *read_req) { const struct afs_xdr_AFSFetchStatus *xdr = (const void *)*_bp; + bool inline_error = (call->operation_ID == afs_FS_InlineBulkStatus); u64 data_version, size; u32 type, abort_code; u8 flags = 0; @@ -142,13 +143,32 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, if (vnode) write_seqlock(&vnode->cb_lock); + abort_code = ntohl(xdr->abort_code); + if (xdr->if_version != htonl(AFS_FSTATUS_VERSION)) { + if (xdr->if_version == htonl(0) && + abort_code != 0 && + inline_error) { + /* The OpenAFS fileserver has a bug in FS.InlineBulkStatus + * whereby it doesn't set the interface version in the error + * case. + */ + status->abort_code = abort_code; + ret = 0; + goto out; + } + pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version)); goto bad; } + if (abort_code != 0 && inline_error) { + status->abort_code = abort_code; + ret = 0; + goto out; + } + type = ntohl(xdr->type); - abort_code = ntohl(xdr->abort_code); switch (type) { case AFS_FTYPE_FILE: case AFS_FTYPE_DIR: @@ -165,13 +185,6 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call, } status->type = type; break; - case AFS_FTYPE_INVALID: - if (abort_code != 0) { - status->abort_code = abort_code; - ret = 0; - goto out; - } - /* Fall through */ default: goto bad; } @@ -248,7 +261,7 @@ static void xdr_decode_AFSCallBack(struct afs_call *call, write_seqlock(&vnode->cb_lock); - if (call->cb_break == (vnode->cb_break + cbi->server->cb_s_break)) { + if (call->cb_break == afs_cb_break_sum(vnode, cbi)) { vnode->cb_version = ntohl(*bp++); cb_expiry = ntohl(*bp++); vnode->cb_type = ntohl(*bp++); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 06194cfe9724..479b7fdda124 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -108,7 +108,7 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_fetch_file_status(&fc, NULL, new_inode); } @@ -393,15 +393,18 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) read_seqlock_excl(&vnode->cb_lock); if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { - if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break) { + if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break || + vnode->cb_v_break != vnode->volume->cb_v_break) { vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; + vnode->cb_v_break = vnode->volume->cb_v_break; + valid = false; } else if (vnode->status.type == AFS_FTYPE_DIR && test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) && vnode->cb_expires_at - 10 > now) { - valid = true; + valid = true; } else if (!test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && vnode->cb_expires_at - 10 > now) { - valid = true; + valid = true; } } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { valid = true; @@ -415,7 +418,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) if (valid) goto valid; - mutex_lock(&vnode->validate_lock); + down_write(&vnode->validate_lock); /* if the promise has expired, we need to check the server again to get * a new promise - note that if the (parent) directory's metadata was @@ -444,13 +447,13 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) * different */ if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) afs_zap_data(vnode); - mutex_unlock(&vnode->validate_lock); + up_write(&vnode->validate_lock); valid: _leave(" = 0"); return 0; error_unlock: - mutex_unlock(&vnode->validate_lock); + up_write(&vnode->validate_lock); _leave(" = %d", ret); return ret; } @@ -574,7 +577,7 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_setattr(&fc, attr); } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index f8086ec95e24..e3f8a46663db 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -396,6 +396,7 @@ struct afs_server { #define AFS_SERVER_FL_PROBED 5 /* The fileserver has been probed */ #define AFS_SERVER_FL_PROBING 6 /* Fileserver is being probed */ #define AFS_SERVER_FL_NO_IBULK 7 /* Fileserver doesn't support FS.InlineBulkStatus */ +#define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ atomic_t usage; u32 addr_version; /* Address list version */ @@ -433,6 +434,7 @@ struct afs_server_list { unsigned short index; /* Server currently in use */ unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ unsigned int seq; /* Set to ->servers_seq when installed */ + rwlock_t lock; struct afs_server_entry servers[]; }; @@ -459,6 +461,9 @@ struct afs_volume { rwlock_t servers_lock; /* Lock for ->servers */ unsigned int servers_seq; /* Incremented each time ->servers changes */ + unsigned cb_v_break; /* Break-everything counter. */ + rwlock_t cb_break_lock; + afs_voltype_t type; /* type of volume */ short error; char type_force; /* force volume type (suppress R/O -> R/W) */ @@ -494,7 +499,7 @@ struct afs_vnode { #endif struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ struct mutex io_lock; /* Lock for serialising I/O on this mutex */ - struct mutex validate_lock; /* lock for validating this vnode */ + struct rw_semaphore validate_lock; /* lock for validating this vnode */ spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; @@ -519,6 +524,7 @@ struct afs_vnode { /* outstanding callback notification on this file */ struct afs_cb_interest *cb_interest; /* Server on which this resides */ unsigned int cb_s_break; /* Mass break counter on ->server */ + unsigned int cb_v_break; /* Mass break counter on ->volume */ unsigned int cb_break; /* Break counter on vnode */ seqlock_t cb_lock; /* Lock for ->cb_interest, ->status, ->cb_*break */ @@ -648,16 +654,29 @@ extern void afs_init_callback_state(struct afs_server *); extern void afs_break_callback(struct afs_vnode *); extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break*); -extern int afs_register_server_cb_interest(struct afs_vnode *, struct afs_server_entry *); +extern int afs_register_server_cb_interest(struct afs_vnode *, + struct afs_server_list *, unsigned int); extern void afs_put_cb_interest(struct afs_net *, struct afs_cb_interest *); extern void afs_clear_callback_interests(struct afs_net *, struct afs_server_list *); static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest *cbi) { - refcount_inc(&cbi->usage); + if (cbi) + refcount_inc(&cbi->usage); return cbi; } +static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) +{ + return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break; +} + +static inline unsigned int afs_cb_break_sum(struct afs_vnode *vnode, + struct afs_cb_interest *cbi) +{ + return vnode->cb_break + cbi->server->cb_s_break + vnode->volume->cb_v_break; +} + /* * cell.c */ diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 839a22280606..3aad32762989 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -62,7 +62,6 @@ static const struct file_operations afs_proc_rootcell_fops = { .llseek = no_llseek, }; -static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file); static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, loff_t *pos); @@ -76,15 +75,6 @@ static const struct seq_operations afs_proc_cell_volumes_ops = { .show = afs_proc_cell_volumes_show, }; -static const struct file_operations afs_proc_cell_volumes_fops = { - .open = afs_proc_cell_volumes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int afs_proc_cell_vlservers_open(struct inode *inode, - struct file *file); static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, loff_t *pos); @@ -98,14 +88,6 @@ static const struct seq_operations afs_proc_cell_vlservers_ops = { .show = afs_proc_cell_vlservers_show, }; -static const struct file_operations afs_proc_cell_vlservers_fops = { - .open = afs_proc_cell_vlservers_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int afs_proc_servers_open(struct inode *inode, struct file *file); static void *afs_proc_servers_start(struct seq_file *p, loff_t *pos); static void *afs_proc_servers_next(struct seq_file *p, void *v, loff_t *pos); @@ -119,13 +101,6 @@ static const struct seq_operations afs_proc_servers_ops = { .show = afs_proc_servers_show, }; -static const struct file_operations afs_proc_servers_fops = { - .open = afs_proc_servers_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int afs_proc_sysname_open(struct inode *inode, struct file *file); static int afs_proc_sysname_release(struct inode *inode, struct file *file); static void *afs_proc_sysname_start(struct seq_file *p, loff_t *pos); @@ -152,7 +127,7 @@ static const struct file_operations afs_proc_sysname_fops = { .write = afs_proc_sysname_write, }; -static const struct file_operations afs_proc_stats_fops; +static int afs_proc_stats_show(struct seq_file *m, void *v); /* * initialise the /proc/fs/afs/ directory @@ -167,8 +142,8 @@ int afs_proc_init(struct afs_net *net) if (!proc_create("cells", 0644, net->proc_afs, &afs_proc_cells_fops) || !proc_create("rootcell", 0644, net->proc_afs, &afs_proc_rootcell_fops) || - !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops) || - !proc_create("stats", 0644, net->proc_afs, &afs_proc_stats_fops) || + !proc_create_seq("servers", 0644, net->proc_afs, &afs_proc_servers_ops) || + !proc_create_single("stats", 0644, net->proc_afs, afs_proc_stats_show) || !proc_create("sysname", 0644, net->proc_afs, &afs_proc_sysname_fops)) goto error_tree; @@ -196,16 +171,7 @@ void afs_proc_cleanup(struct afs_net *net) */ static int afs_proc_cells_open(struct inode *inode, struct file *file) { - struct seq_file *m; - int ret; - - ret = seq_open(file, &afs_proc_cells_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = PDE_DATA(inode); - return 0; + return seq_open(file, &afs_proc_cells_ops); } /* @@ -430,10 +396,11 @@ int afs_proc_cell_setup(struct afs_net *net, struct afs_cell *cell) if (!dir) goto error_dir; - if (!proc_create_data("vlservers", 0, dir, - &afs_proc_cell_vlservers_fops, cell) || - !proc_create_data("volumes", 0, dir, - &afs_proc_cell_volumes_fops, cell)) + if (!proc_create_seq_data("vlservers", 0, dir, + &afs_proc_cell_vlservers_ops, cell)) + goto error_tree; + if (!proc_create_seq_data("volumes", 0, dir, &afs_proc_cell_volumes_ops, + cell)) goto error_tree; _leave(" = 0"); @@ -459,36 +426,13 @@ void afs_proc_cell_remove(struct afs_net *net, struct afs_cell *cell) } /* - * open "/proc/fs/afs/<cell>/volumes" which provides a summary of extant cells - */ -static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) -{ - struct afs_cell *cell; - struct seq_file *m; - int ret; - - cell = PDE_DATA(inode); - if (!cell) - return -ENOENT; - - ret = seq_open(file, &afs_proc_cell_volumes_ops); - if (ret < 0) - return ret; - - m = file->private_data; - m->private = cell; - - return 0; -} - -/* * set up the iterator to start reading from the cells list and return the * first item */ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) __acquires(cell->proc_lock) { - struct afs_cell *cell = m->private; + struct afs_cell *cell = PDE_DATA(file_inode(m->file)); _enter("cell=%p pos=%Ld", cell, *_pos); @@ -502,7 +446,7 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, loff_t *_pos) { - struct afs_cell *cell = p->private; + struct afs_cell *cell = PDE_DATA(file_inode(p->file)); _enter("cell=%p pos=%Ld", cell, *_pos); return seq_list_next(v, &cell->proc_volumes, _pos); @@ -514,7 +458,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v) __releases(cell->proc_lock) { - struct afs_cell *cell = p->private; + struct afs_cell *cell = PDE_DATA(file_inode(p->file)); read_unlock(&cell->proc_lock); } @@ -530,7 +474,7 @@ static const char afs_vol_types[3][3] = { */ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) { - struct afs_cell *cell = m->private; + struct afs_cell *cell = PDE_DATA(file_inode(m->file)); struct afs_volume *vol = list_entry(v, struct afs_volume, proc_link); /* Display header on line 1 */ @@ -547,30 +491,6 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) } /* - * open "/proc/fs/afs/<cell>/vlservers" which provides a list of volume - * location server - */ -static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) -{ - struct afs_cell *cell; - struct seq_file *m; - int ret; - - cell = PDE_DATA(inode); - if (!cell) - return -ENOENT; - - ret = seq_open(file, &afs_proc_cell_vlservers_ops); - if (ret<0) - return ret; - - m = file->private_data; - m->private = cell; - - return 0; -} - -/* * set up the iterator to start reading from the cells list and return the * first item */ @@ -578,7 +498,7 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) __acquires(rcu) { struct afs_addr_list *alist; - struct afs_cell *cell = m->private; + struct afs_cell *cell = PDE_DATA(file_inode(m->file)); loff_t pos = *_pos; rcu_read_lock(); @@ -603,7 +523,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, loff_t *_pos) { struct afs_addr_list *alist; - struct afs_cell *cell = p->private; + struct afs_cell *cell = PDE_DATA(file_inode(p->file)); loff_t pos; alist = rcu_dereference(cell->vl_addrs); @@ -644,15 +564,6 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) } /* - * open "/proc/fs/afs/servers" which provides a summary of active - * servers - */ -static int afs_proc_servers_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &afs_proc_servers_ops); -} - -/* * Set up the iterator to start reading from the server list and return the * first item. */ @@ -931,18 +842,3 @@ static int afs_proc_stats_show(struct seq_file *m, void *v) atomic_long_read(&net->n_store_bytes)); return 0; } - -/* - * Open "/proc/fs/afs/stats" to allow reading of the stat counters. - */ -static int afs_proc_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, afs_proc_stats_show, NULL); -} - -static const struct file_operations afs_proc_stats_fops = { - .open = afs_proc_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index ac0feac9d746..e065bc0768e6 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -179,7 +179,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) */ if (fc->flags & AFS_FS_CURSOR_VNOVOL) { fc->ac.error = -EREMOTEIO; - goto failed; + goto next_server; } write_lock(&vnode->volume->servers_lock); @@ -201,7 +201,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) */ if (vnode->volume->servers == fc->server_list) { fc->ac.error = -EREMOTEIO; - goto failed; + goto next_server; } /* Try again */ @@ -350,8 +350,8 @@ use_server: * break request before we've finished decoding the reply and * installing the vnode. */ - fc->ac.error = afs_register_server_cb_interest( - vnode, &fc->server_list->servers[fc->index]); + fc->ac.error = afs_register_server_cb_interest(vnode, fc->server_list, + fc->index); if (fc->ac.error < 0) goto failed; @@ -369,8 +369,16 @@ use_server: if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) { fc->ac.alist = afs_get_addrlist(alist); - if (!afs_probe_fileserver(fc)) - goto failed; + if (!afs_probe_fileserver(fc)) { + switch (fc->ac.error) { + case -ENOMEM: + case -ERESTARTSYS: + case -EINTR: + goto failed; + default: + goto next_server; + } + } } if (!fc->ac.alist) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 5c6263972ec9..08735948f15d 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -41,6 +41,7 @@ int afs_open_socket(struct afs_net *net) { struct sockaddr_rxrpc srx; struct socket *socket; + unsigned int min_level; int ret; _enter(""); @@ -60,6 +61,12 @@ int afs_open_socket(struct afs_net *net) srx.transport.sin6.sin6_family = AF_INET6; srx.transport.sin6.sin6_port = htons(AFS_CM_PORT); + min_level = RXRPC_SECURITY_ENCRYPT; + ret = kernel_setsockopt(socket, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, + (void *)&min_level, sizeof(min_level)); + if (ret < 0) + goto error_2; + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; @@ -482,8 +489,12 @@ static void afs_deliver_to_call(struct afs_call *call) state = READ_ONCE(call->state); switch (ret) { case 0: - if (state == AFS_CALL_CL_PROC_REPLY) + if (state == AFS_CALL_CL_PROC_REPLY) { + if (call->cbi) + set_bit(AFS_SERVER_FL_MAY_HAVE_CB, + &call->cbi->server->flags); goto call_complete; + } ASSERTCMP(state, >, AFS_CALL_CL_PROC_REPLY); goto done; case -EINPROGRESS: @@ -493,11 +504,6 @@ static void afs_deliver_to_call(struct afs_call *call) case -ECONNABORTED: ASSERTCMP(state, ==, AFS_CALL_COMPLETE); goto done; - case -ENOTCONN: - abort_code = RX_CALL_DEAD; - rxrpc_kernel_abort_call(call->net->socket, call->rxcall, - abort_code, ret, "KNC"); - goto local_abort; case -ENOTSUPP: abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, diff --git a/fs/afs/security.c b/fs/afs/security.c index cea2fff313dc..81dfedb7879f 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -147,8 +147,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, break; } - if (cb_break != (vnode->cb_break + - vnode->cb_interest->server->cb_s_break)) { + if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) { changed = true; break; } @@ -178,7 +177,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, } } - if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break)) + if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) goto someone_else_changed_it; /* We need a ref on any permits list we want to copy as we'll have to @@ -257,7 +256,7 @@ found: spin_lock(&vnode->lock); zap = rcu_access_pointer(vnode->permit_cache); - if (cb_break == (vnode->cb_break + vnode->cb_interest->server->cb_s_break) && + if (cb_break == afs_cb_break_sum(vnode, vnode->cb_interest) && zap == permits) rcu_assign_pointer(vnode->permit_cache, replacement); else @@ -373,18 +372,14 @@ int afs_permission(struct inode *inode, int mask) mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file"); if (S_ISDIR(inode->i_mode)) { - if (mask & MAY_EXEC) { + if (mask & (MAY_EXEC | MAY_READ | MAY_CHDIR)) { if (!(access & AFS_ACE_LOOKUP)) goto permission_denied; - } else if (mask & MAY_READ) { - if (!(access & AFS_ACE_LOOKUP)) - goto permission_denied; - } else if (mask & MAY_WRITE) { + } + if (mask & MAY_WRITE) { if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */ AFS_ACE_INSERT))) /* create, mkdir, symlink, rename to */ goto permission_denied; - } else { - BUG(); } } else { if (!(access & AFS_ACE_LOOKUP)) diff --git a/fs/afs/server.c b/fs/afs/server.c index e23be63998a8..3af4625e2f8c 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -67,12 +67,6 @@ struct afs_server *afs_find_server(struct afs_net *net, sizeof(struct in6_addr)); if (diff == 0) goto found; - if (diff < 0) { - // TODO: Sort the list - //if (i == alist->nr_ipv4) - // goto not_found; - break; - } } } } else { @@ -87,17 +81,10 @@ struct afs_server *afs_find_server(struct afs_net *net, (u32 __force)b->sin6_addr.s6_addr32[3]); if (diff == 0) goto found; - if (diff < 0) { - // TODO: Sort the list - //if (i == 0) - // goto not_found; - break; - } } } } - //not_found: server = NULL; found: if (server && !atomic_inc_not_zero(&server->usage)) @@ -395,14 +382,16 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) struct afs_addr_list *alist = rcu_access_pointer(server->addresses); struct afs_addr_cursor ac = { .alist = alist, - .addr = &alist->addrs[0], .start = alist->index, - .index = alist->index, + .index = 0, + .addr = &alist->addrs[alist->index], .error = 0, }; _enter("%p", server); - afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) + afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + call_rcu(&server->rcu, afs_server_rcu); afs_dec_servers_outstanding(net); } @@ -428,8 +417,15 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) } write_sequnlock(&net->fs_lock); - if (deleted) + if (deleted) { + write_seqlock(&net->fs_addr_lock); + if (!hlist_unhashed(&server->addr4_link)) + hlist_del_rcu(&server->addr4_link); + if (!hlist_unhashed(&server->addr6_link)) + hlist_del_rcu(&server->addr6_link); + write_sequnlock(&net->fs_addr_lock); afs_destroy_server(net, server); + } } } diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 0f8dc4c8f07c..8a5760aa5832 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -49,6 +49,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, goto error; refcount_set(&slist->usage, 1); + rwlock_init(&slist->lock); /* Make sure a records exists for each server in the list. */ for (i = 0; i < vldb->nr_servers; i++) { @@ -64,9 +65,11 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, goto error_2; } - /* Insertion-sort by server pointer */ + /* Insertion-sort by UUID */ for (j = 0; j < slist->nr_servers; j++) - if (slist->servers[j].server >= server) + if (memcmp(&slist->servers[j].server->uuid, + &server->uuid, + sizeof(server->uuid)) >= 0) break; if (j < slist->nr_servers) { if (slist->servers[j].server == server) { diff --git a/fs/afs/super.c b/fs/afs/super.c index 65081ec3c36e..9e5d7966621c 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -590,7 +590,7 @@ static void afs_i_init_once(void *_vnode) memset(vnode, 0, sizeof(*vnode)); inode_init_once(&vnode->vfs_inode); mutex_init(&vnode->io_lock); - mutex_init(&vnode->validate_lock); + init_rwsem(&vnode->validate_lock); spin_lock_init(&vnode->wb_lock); spin_lock_init(&vnode->lock); INIT_LIST_HEAD(&vnode->wb_keys); @@ -688,7 +688,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) if (afs_begin_vnode_operation(&fc, vnode, key)) { fc.flags |= AFS_FS_CURSOR_NO_VSLEEP; while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_get_volume_status(&fc, &vs); } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 1ed7e2fd2f35..c3b740813fc7 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -23,7 +23,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) struct afs_uvldbentry__xdr *uvldb; struct afs_vldb_entry *entry; bool new_only = false; - u32 tmp, nr_servers; + u32 tmp, nr_servers, vlflags; int i, ret; _enter(""); @@ -55,6 +55,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) new_only = true; } + vlflags = ntohl(uvldb->flags); for (i = 0; i < nr_servers; i++) { struct afs_uuid__xdr *xdr; struct afs_uuid *uuid; @@ -64,12 +65,13 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) if (tmp & AFS_VLSF_DONTUSE || (new_only && !(tmp & AFS_VLSF_NEWREPSITE))) continue; - if (tmp & AFS_VLSF_RWVOL) + if (tmp & AFS_VLSF_RWVOL) { entry->fs_mask[i] |= AFS_VOL_VTM_RW; + if (vlflags & AFS_VLF_BACKEXISTS) + entry->fs_mask[i] |= AFS_VOL_VTM_BAK; + } if (tmp & AFS_VLSF_ROVOL) entry->fs_mask[i] |= AFS_VOL_VTM_RO; - if (tmp & AFS_VLSF_BACKVOL) - entry->fs_mask[i] |= AFS_VOL_VTM_BAK; if (!entry->fs_mask[i]) continue; @@ -89,15 +91,14 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) for (i = 0; i < AFS_MAXTYPES; i++) entry->vid[i] = ntohl(uvldb->volumeId[i]); - tmp = ntohl(uvldb->flags); - if (tmp & AFS_VLF_RWEXISTS) + if (vlflags & AFS_VLF_RWEXISTS) __set_bit(AFS_VLDB_HAS_RW, &entry->flags); - if (tmp & AFS_VLF_ROEXISTS) + if (vlflags & AFS_VLF_ROEXISTS) __set_bit(AFS_VLDB_HAS_RO, &entry->flags); - if (tmp & AFS_VLF_BACKEXISTS) + if (vlflags & AFS_VLF_BACKEXISTS) __set_bit(AFS_VLDB_HAS_BAK, &entry->flags); - if (!(tmp & (AFS_VLF_RWEXISTS | AFS_VLF_ROEXISTS | AFS_VLF_BACKEXISTS))) { + if (!(vlflags & (AFS_VLF_RWEXISTS | AFS_VLF_ROEXISTS | AFS_VLF_BACKEXISTS))) { entry->error = -ENOMEDIUM; __set_bit(AFS_VLDB_QUERY_ERROR, &entry->flags); } diff --git a/fs/afs/write.c b/fs/afs/write.c index c164698dc304..8b39e6ebb40b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -351,7 +351,7 @@ found_key: ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, wbk->key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_store_data(&fc, mapping, first, last, offset, to); } @@ -634,9 +634,8 @@ static void free_ioctx_users(struct percpu_ref *ref) while (!list_empty(&ctx->active_reqs)) { req = list_first_entry(&ctx->active_reqs, struct aio_kiocb, ki_list); - - list_del_init(&req->ki_list); kiocb_cancel(req); + list_del_init(&req->ki_list); } spin_unlock_irq(&ctx->ctx_lock); @@ -1078,8 +1077,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { - percpu_ref_get(&ctx->users); - ret = ctx; + if (percpu_ref_tryget_live(&ctx->users)) + ret = ctx; } out: rcu_read_unlock(); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 82e8f6edfb48..b12e37f27530 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -749,7 +749,7 @@ static int autofs4_dir_mkdir(struct inode *dir, autofs4_del_active(dentry); - inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555); + inode = autofs4_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) return -ENOMEM; d_add(dentry, inode); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index af2832aaeec5..4700b4534439 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -198,23 +198,16 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) if (ret == BEFS_BT_NOT_FOUND) { befs_debug(sb, "<--- %s %pd not found", __func__, dentry); - d_add(dentry, NULL); - return ERR_PTR(-ENOENT); - + inode = NULL; } else if (ret != BEFS_OK || offset == 0) { befs_error(sb, "<--- %s Error", __func__); - return ERR_PTR(-ENODATA); + inode = ERR_PTR(-ENODATA); + } else { + inode = befs_iget(dir->i_sb, (ino_t) offset); } - - inode = befs_iget(dir->i_sb, (ino_t) offset); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - d_add(dentry, inode); - befs_debug(sb, "<--- %s", __func__); - return NULL; + return d_splice_alias(inode, dentry); } static int diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 41e04183e4ce..4ad6f669fe34 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -377,10 +377,10 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, } else map_addr = vm_mmap(filep, addr, size, prot, type, off); - if ((type & MAP_FIXED_NOREPLACE) && BAD_ADDR(map_addr)) - pr_info("%d (%s): Uhuuh, elf segment at %p requested but the memory is mapped already\n", - task_pid_nr(current), current->comm, - (void *)addr); + if ((type & MAP_FIXED_NOREPLACE) && + PTR_ERR((void *)map_addr) == -EEXIST) + pr_info("%d (%s): Uhuuh, elf segment at %px requested but the memory is mapped already\n", + task_pid_nr(current), current->comm, (void *)addr); return(map_addr); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 7ec920e27065..bef6934b6189 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -272,7 +272,7 @@ struct blkdev_dio { struct bio bio; }; -static struct bio_set *blkdev_dio_pool __read_mostly; +static struct bio_set blkdev_dio_pool; static void blkdev_bio_end_io(struct bio *bio) { @@ -334,7 +334,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool); + bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); bio_get(bio); /* extra ref for the completion handler */ dio = container_of(bio, struct blkdev_dio, bio); @@ -432,10 +432,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static __init int blkdev_init(void) { - blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); - if (!blkdev_dio_pool) - return -ENOMEM; - return 0; + return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); } module_init(blkdev_init); @@ -1322,27 +1319,30 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) * check_disk_size_change - checks for disk size change and adjusts bdev size. * @disk: struct gendisk to check * @bdev: struct bdev to adjust. + * @verbose: if %true log a message about a size change if there is any * * This routine checks to see if the bdev size does not match the disk size * and adjusts it if it differs. When shrinking the bdev size, its all caches * are freed. */ -void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) +void check_disk_size_change(struct gendisk *disk, struct block_device *bdev, + bool verbose) { loff_t disk_size, bdev_size; disk_size = (loff_t)get_capacity(disk) << 9; bdev_size = i_size_read(bdev->bd_inode); if (disk_size != bdev_size) { - printk(KERN_INFO - "%s: detected capacity change from %lld to %lld\n", - disk->disk_name, bdev_size, disk_size); + if (verbose) { + printk(KERN_INFO + "%s: detected capacity change from %lld to %lld\n", + disk->disk_name, bdev_size, disk_size); + } i_size_write(bdev->bd_inode, disk_size); if (bdev_size > disk_size) flush_disk(bdev, false); } } -EXPORT_SYMBOL(check_disk_size_change); /** * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back @@ -1364,7 +1364,7 @@ int revalidate_disk(struct gendisk *disk) return ret; mutex_lock(&bdev->bd_mutex); - check_disk_size_change(disk, bdev); + check_disk_size_change(disk, bdev, ret == 0); bdev->bd_invalidated = 0; mutex_unlock(&bdev->bd_mutex); bdput(bdev); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3fd44835b386..8c68961925b1 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2436,10 +2436,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - btrfs_release_path(p); - ret = -EAGAIN; - tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1, + tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1, &first_key); if (!IS_ERR(tmp)) { /* @@ -2454,6 +2452,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, } else { ret = PTR_ERR(tmp); } + + btrfs_release_path(p); return ret; } @@ -5414,12 +5414,24 @@ int btrfs_compare_trees(struct btrfs_root *left_root, down_read(&fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; - left_path->nodes[left_level] = left_root->commit_root; + left_path->nodes[left_level] = + btrfs_clone_extent_buffer(left_root->commit_root); + if (!left_path->nodes[left_level]) { + up_read(&fs_info->commit_root_sem); + ret = -ENOMEM; + goto out; + } extent_buffer_get(left_path->nodes[left_level]); right_level = btrfs_header_level(right_root->commit_root); right_root_level = right_level; - right_path->nodes[right_level] = right_root->commit_root; + right_path->nodes[right_level] = + btrfs_clone_extent_buffer(right_root->commit_root); + if (!right_path->nodes[right_level]) { + up_read(&fs_info->commit_root_sem); + ret = -ENOMEM; + goto out; + } extent_buffer_get(right_path->nodes[right_level]); up_read(&fs_info->commit_root_sem); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5474ef14d6e6..0d422c9908b8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -459,6 +459,25 @@ struct btrfs_block_rsv { unsigned short full; unsigned short type; unsigned short failfast; + + /* + * Qgroup equivalent for @size @reserved + * + * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care + * about things like csum size nor how many tree blocks it will need to + * reserve. + * + * Qgroup cares more about net change of the extent usage. + * + * So for one newly inserted file extent, in worst case it will cause + * leaf split and level increase, nodesize for each file extent is + * already too much. + * + * In short, qgroup_size/reserved is the upper limit of possible needed + * qgroup metadata reservation. + */ + u64 qgroup_rsv_size; + u64 qgroup_rsv_reserved; }; /* @@ -714,6 +733,12 @@ struct btrfs_delayed_root; */ #define BTRFS_FS_EXCL_OP 16 +/* + * To info transaction_kthread we need an immediate commit so it doesn't + * need to wait for commit_interval + */ +#define BTRFS_FS_NEED_ASYNC_COMMIT 17 + struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; @@ -3157,6 +3182,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes); +void __btrfs_del_delalloc_inode(struct btrfs_root *root, + struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 06ec8ab6d9ba..a8d492dbd3e7 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -556,6 +556,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, dst_rsv = &fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + + /* + * Here we migrate space rsv from transaction rsv, since have already + * reserved space when starting a transaction. So no need to reserve + * qgroup space here. + */ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_item", @@ -577,7 +583,10 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, return; rsv = &fs_info->delayed_block_rsv; - btrfs_qgroup_convert_reserved_meta(root, item->bytes_reserved); + /* + * Check btrfs_delayed_item_reserve_metadata() to see why we don't need + * to release/reserve qgroup space. + */ trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, item->bytes_reserved, 0); @@ -602,9 +611,6 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); - if (ret < 0) - return ret; /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction * which doesn't reserve space for speed. This is a problem since we @@ -616,6 +622,10 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { + ret = btrfs_qgroup_reserve_meta_prealloc(root, + fs_info->nodesize, true); + if (ret < 0) + return ret; ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH); /* @@ -634,6 +644,8 @@ static int btrfs_delayed_inode_reserve_metadata( "delayed_inode", btrfs_ino(inode), num_bytes, 1); + } else { + btrfs_qgroup_free_meta_prealloc(root, fs_info->nodesize); } return ret; } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9e98295de7ce..e1b0651686f7 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -540,8 +540,10 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, - int action, int is_data, int *qrecord_inserted_ret, + int action, int is_data, int is_system, + int *qrecord_inserted_ret, int *old_ref_mod, int *new_ref_mod) + { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; @@ -585,6 +587,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref->ref_mod = count_mod; head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; + head_ref->is_system = is_system; head_ref->ref_tree = RB_ROOT; INIT_LIST_HEAD(&head_ref->ref_add_list); RB_CLEAR_NODE(&head_ref->href_node); @@ -772,6 +775,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; int qrecord_inserted; + int is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID); BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); @@ -800,8 +804,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, 0, 0, action, 0, - &qrecord_inserted, old_ref_mod, - new_ref_mod); + is_system, &qrecord_inserted, + old_ref_mod, new_ref_mod); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action); @@ -868,7 +872,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, ref_root, reserved, - action, 1, &qrecord_inserted, + action, 1, 0, &qrecord_inserted, old_ref_mod, new_ref_mod); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -898,9 +902,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); + /* + * extent_ops just modify the flags of an extent and they don't result + * in ref count changes, hence it's safe to pass false/0 for is_system + * argument + */ add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr, num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data, NULL, NULL, NULL); + extent_op->is_data, 0, NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 741869dbc316..7f00db50bd24 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -127,6 +127,7 @@ struct btrfs_delayed_ref_head { */ unsigned int must_insert_reserved:1; unsigned int is_data:1; + unsigned int is_system:1; unsigned int processing:1; }; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4ac8b1d21baf..c3504b4d281b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1824,6 +1824,7 @@ static int transaction_kthread(void *arg) now = get_seconds(); if (cur->state < TRANS_STATE_BLOCKED && + !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) && (now < cur->start_time || now - cur->start_time < fs_info->commit_interval)) { spin_unlock(&fs_info->trans_lock); @@ -3817,6 +3818,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); btrfs_free_qgroup_config(fs_info); + ASSERT(list_empty(&fs_info->delalloc_roots)); if (percpu_counter_sum(&fs_info->delalloc_bytes)) { btrfs_info(fs_info, "at unmount delalloc count %lld", @@ -4124,15 +4126,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) { + /* cleanup FS via transaction */ + btrfs_cleanup_transaction(fs_info); + mutex_lock(&fs_info->cleaner_mutex); btrfs_run_delayed_iputs(fs_info); mutex_unlock(&fs_info->cleaner_mutex); down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); - - /* cleanup FS via transaction */ - btrfs_cleanup_transaction(fs_info); } static void btrfs_destroy_ordered_extents(struct btrfs_root *root) @@ -4257,19 +4259,23 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { + struct inode *inode = NULL; btrfs_inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - - list_del_init(&btrfs_inode->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &btrfs_inode->runtime_flags); + __btrfs_del_delalloc_inode(root, btrfs_inode); spin_unlock(&root->delalloc_lock); - btrfs_invalidate_inodes(btrfs_inode->root); - + /* + * Make sure we get a live inode and that it'll not disappear + * meanwhile. + */ + inode = igrab(&btrfs_inode->vfs_inode); + if (inode) { + invalidate_inode_pages2(inode->i_mapping); + iput(inode); + } spin_lock(&root->delalloc_lock); } - spin_unlock(&root->delalloc_lock); } @@ -4285,7 +4291,6 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); - list_del_init(&root->delalloc_root); root = btrfs_grab_fs_root(root); BUG_ON(!root); spin_unlock(&fs_info->delalloc_root_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 75cfb80d2551..51b5e2da708c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2601,13 +2601,19 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, trace_run_delayed_ref_head(fs_info, head, 0); if (head->total_ref_mod < 0) { - struct btrfs_block_group_cache *cache; + struct btrfs_space_info *space_info; + u64 flags; - cache = btrfs_lookup_block_group(fs_info, head->bytenr); - ASSERT(cache); - percpu_counter_add(&cache->space_info->total_bytes_pinned, + if (head->is_data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (head->is_system) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + space_info = __find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add(&space_info->total_bytes_pinned, -head->num_bytes); - btrfs_put_block_group(cache); if (head->is_data) { spin_lock(&delayed_refs->lock); @@ -3136,7 +3142,11 @@ static noinline int check_delayed_ref(struct btrfs_root *root, struct rb_node *node; int ret = 0; + spin_lock(&root->fs_info->trans_lock); cur_trans = root->fs_info->running_transaction; + if (cur_trans) + refcount_inc(&cur_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); if (!cur_trans) return 0; @@ -3145,6 +3155,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (!head) { spin_unlock(&delayed_refs->lock); + btrfs_put_transaction(cur_trans); return 0; } @@ -3161,6 +3172,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); + btrfs_put_transaction(cur_trans); return -EAGAIN; } spin_unlock(&delayed_refs->lock); @@ -3193,6 +3205,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } spin_unlock(&head->lock); mutex_unlock(&head->mutex); + btrfs_put_transaction(cur_trans); return ret; } @@ -5559,14 +5572,18 @@ again: static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, - struct btrfs_block_rsv *dest, u64 num_bytes) + struct btrfs_block_rsv *dest, u64 num_bytes, + u64 *qgroup_to_release_ret) { struct btrfs_space_info *space_info = block_rsv->space_info; + u64 qgroup_to_release = 0; u64 ret; spin_lock(&block_rsv->lock); - if (num_bytes == (u64)-1) + if (num_bytes == (u64)-1) { num_bytes = block_rsv->size; + qgroup_to_release = block_rsv->qgroup_rsv_size; + } block_rsv->size -= num_bytes; if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; @@ -5575,6 +5592,13 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } else { num_bytes = 0; } + if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { + qgroup_to_release = block_rsv->qgroup_rsv_reserved - + block_rsv->qgroup_rsv_size; + block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; + } else { + qgroup_to_release = 0; + } spin_unlock(&block_rsv->lock); ret = num_bytes; @@ -5597,6 +5621,8 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, space_info_add_old_bytes(fs_info, space_info, num_bytes); } + if (qgroup_to_release_ret) + *qgroup_to_release_ret = qgroup_to_release; return ret; } @@ -5738,17 +5764,21 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 num_bytes = 0; + u64 qgroup_num_bytes = 0; int ret = -ENOSPC; spin_lock(&block_rsv->lock); if (block_rsv->reserved < block_rsv->size) num_bytes = block_rsv->size - block_rsv->reserved; + if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size) + qgroup_num_bytes = block_rsv->qgroup_rsv_size - + block_rsv->qgroup_rsv_reserved; spin_unlock(&block_rsv->lock); if (num_bytes == 0) return 0; - ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true); if (ret) return ret; ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); @@ -5756,7 +5786,13 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, block_rsv_add_bytes(block_rsv, num_bytes, 0); trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), num_bytes, 1); - } + + /* Don't forget to increase qgroup_rsv_reserved */ + spin_lock(&block_rsv->lock); + block_rsv->qgroup_rsv_reserved += qgroup_num_bytes; + spin_unlock(&block_rsv->lock); + } else + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); return ret; } @@ -5777,20 +5813,23 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 released = 0; + u64 qgroup_to_release = 0; /* * Since we statically set the block_rsv->size we just want to say we * are releasing 0 bytes, and then we'll just get the reservation over * the size free'd. */ - released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0); + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0, + &qgroup_to_release); if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); if (qgroup_free) - btrfs_qgroup_free_meta_prealloc(inode->root, released); + btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); else - btrfs_qgroup_convert_reserved_meta(inode->root, released); + btrfs_qgroup_convert_reserved_meta(inode->root, + qgroup_to_release); } void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, @@ -5802,7 +5841,7 @@ void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, if (global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; - block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes); + block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL); } static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -5882,7 +5921,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, - (u64)-1); + (u64)-1, NULL); WARN_ON(fs_info->trans_block_rsv.size > 0); WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); @@ -5906,7 +5945,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) WARN_ON_ONCE(!list_empty(&trans->new_bgs)); block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, - trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved, NULL); trans->chunk_bytes_reserved = 0; } @@ -6011,6 +6050,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, { struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 reserve_size = 0; + u64 qgroup_rsv_size = 0; u64 csum_leaves; unsigned outstanding_extents; @@ -6023,9 +6063,17 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, inode->csum_bytes); reserve_size += btrfs_calc_trans_metadata_size(fs_info, csum_leaves); + /* + * For qgroup rsv, the calculation is very simple: + * account one nodesize for each outstanding extent + * + * This is overestimating in most cases. + */ + qgroup_rsv_size = outstanding_extents * fs_info->nodesize; spin_lock(&block_rsv->lock); block_rsv->size = reserve_size; + block_rsv->qgroup_rsv_size = qgroup_rsv_size; spin_unlock(&block_rsv->lock); } @@ -8403,7 +8451,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u32 blocksize) { block_rsv_add_bytes(block_rsv, blocksize, 0); - block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); + block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); } /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e99b329002cf..56d32bb462f9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -26,7 +26,7 @@ static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; -static struct bio_set *btrfs_bioset; +static struct bio_set btrfs_bioset; static inline bool extent_state_in_tree(const struct extent_state *state) { @@ -162,20 +162,18 @@ int __init extent_io_init(void) if (!extent_buffer_cache) goto free_state_cache; - btrfs_bioset = bioset_create(BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio), - BIOSET_NEED_BVECS); - if (!btrfs_bioset) + if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_io_bio, bio), + BIOSET_NEED_BVECS)) goto free_buffer_cache; - if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE)) + if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE)) goto free_bioset; return 0; free_bioset: - bioset_free(btrfs_bioset); - btrfs_bioset = NULL; + bioset_exit(&btrfs_bioset); free_buffer_cache: kmem_cache_destroy(extent_buffer_cache); @@ -198,8 +196,7 @@ void __cold extent_io_exit(void) rcu_barrier(); kmem_cache_destroy(extent_state_cache); kmem_cache_destroy(extent_buffer_cache); - if (btrfs_bioset) - bioset_free(btrfs_bioset); + bioset_exit(&btrfs_bioset); } void extent_io_tree_init(struct extent_io_tree *tree, @@ -2679,7 +2676,7 @@ struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) { struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); @@ -2692,7 +2689,7 @@ struct bio *btrfs_bio_clone(struct bio *bio) struct bio *new; /* Bio allocation backed by a bioset does not fail */ - new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset); + new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); btrfs_bio = btrfs_io_bio(new); btrfs_io_bio_init(btrfs_bio); btrfs_bio->iter = bio->bi_iter; @@ -2704,7 +2701,7 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) struct bio *bio; /* Bio allocation backed by a bioset does not fail */ - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset); + bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; } @@ -2715,7 +2712,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) struct btrfs_io_bio *btrfs_bio; /* this will never fail when it's backed by a bioset */ - bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset); + bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); btrfs_bio = btrfs_io_bio(bio); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0167a9c97c9c..f660ba1e5e58 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1748,7 +1748,7 @@ again: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, - (ret != 0)); + true); if (ret) { btrfs_drop_pages(pages, num_pages); break; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e064c49c9a9a..0b86cf10cf2a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -31,6 +31,7 @@ #include <linux/uio.h> #include <linux/magic.h> #include <linux/iversion.h> +#include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1741,12 +1742,12 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root, spin_unlock(&root->delalloc_lock); } -static void btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) + +void __btrfs_del_delalloc_inode(struct btrfs_root *root, + struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - spin_lock(&root->delalloc_lock); if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, @@ -1759,6 +1760,13 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, spin_unlock(&fs_info->delalloc_root_lock); } } +} + +static void btrfs_del_delalloc_inode(struct btrfs_root *root, + struct btrfs_inode *inode) +{ + spin_lock(&root->delalloc_lock); + __btrfs_del_delalloc_inode(root, inode); spin_unlock(&root->delalloc_lock); } @@ -5905,11 +5913,13 @@ static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) struct dir_entry *entry = addr; char *name = (char *)(entry + 1); - ctx->pos = entry->offset; - if (!dir_emit(ctx, name, entry->name_len, entry->ino, - entry->type)) + ctx->pos = get_unaligned(&entry->offset); + if (!dir_emit(ctx, name, get_unaligned(&entry->name_len), + get_unaligned(&entry->ino), + get_unaligned(&entry->type))) return 1; - addr += sizeof(struct dir_entry) + entry->name_len; + addr += sizeof(struct dir_entry) + + get_unaligned(&entry->name_len); ctx->pos++; } return 0; @@ -5999,14 +6009,15 @@ again: } entry = addr; - entry->name_len = name_len; + put_unaligned(name_len, &entry->name_len); name_ptr = (char *)(entry + 1); read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), name_len); - entry->type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)], + &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); - entry->ino = location.objectid; - entry->offset = found_key.offset; + put_unaligned(location.objectid, &entry->ino); + put_unaligned(found_key.offset, &entry->offset); entries++; addr += sizeof(struct dir_entry) + name_len; total_len += sizeof(struct dir_entry) + name_len; @@ -6575,8 +6586,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock_inode; } else { btrfs_update_inode(trans, root, inode); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); } out_unlock: @@ -6652,8 +6662,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock_inode; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans); @@ -6798,12 +6807,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (err) goto out_fail_inode; - d_instantiate(dentry, inode); - /* - * mkdir is special. We're unlocking after we call d_instantiate - * to avoid a race with nfsd calling d_instantiate. - */ - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); drop_on_err = 0; out_fail: @@ -9113,7 +9117,8 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) BTRFS_EXTENT_DATA_KEY); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) { - err = ret; + if (ret < 0) + err = ret; break; } @@ -10246,8 +10251,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock_inode; } - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 124276bba8cf..21a831d3d087 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -189,9 +189,10 @@ void btrfs_print_leaf(struct extent_buffer *l) fs_info = l->fs_info; nr = btrfs_header_nritems(l); - btrfs_info(fs_info, "leaf %llu total ptrs %d free space %d", - btrfs_header_bytenr(l), nr, - btrfs_leaf_free_space(fs_info, l)); + btrfs_info(fs_info, + "leaf %llu gen %llu total ptrs %d free space %d owner %llu", + btrfs_header_bytenr(l), btrfs_header_generation(l), nr, + btrfs_leaf_free_space(fs_info, l), btrfs_header_owner(l)); for (i = 0 ; i < nr ; i++) { item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); @@ -325,7 +326,7 @@ void btrfs_print_leaf(struct extent_buffer *l) } } -void btrfs_print_tree(struct extent_buffer *c) +void btrfs_print_tree(struct extent_buffer *c, bool follow) { struct btrfs_fs_info *fs_info; int i; u32 nr; @@ -342,15 +343,19 @@ void btrfs_print_tree(struct extent_buffer *c) return; } btrfs_info(fs_info, - "node %llu level %d total ptrs %d free spc %u", - btrfs_header_bytenr(c), level, nr, - (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr); + "node %llu level %d gen %llu total ptrs %d free spc %u owner %llu", + btrfs_header_bytenr(c), level, btrfs_header_generation(c), + nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr, + btrfs_header_owner(c)); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); - pr_info("\tkey %d (%llu %u %llu) block %llu\n", + pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n", i, key.objectid, key.type, key.offset, - btrfs_node_blockptr(c, i)); + btrfs_node_blockptr(c, i), + btrfs_node_ptr_generation(c, i)); } + if (!follow) + return; for (i = 0; i < nr; i++) { struct btrfs_key first_key; struct extent_buffer *next; @@ -372,7 +377,7 @@ void btrfs_print_tree(struct extent_buffer *c) if (btrfs_header_level(next) != level - 1) BUG(); - btrfs_print_tree(next); + btrfs_print_tree(next, follow); free_extent_buffer(next); } } diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 4a98481688f4..e6bb38fd75ad 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -7,6 +7,6 @@ #define BTRFS_PRINT_TREE_H void btrfs_print_leaf(struct extent_buffer *l); -void btrfs_print_tree(struct extent_buffer *c); +void btrfs_print_tree(struct extent_buffer *c, bool follow); #endif diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 53a8c95828e3..dc6140013ae8 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -380,6 +380,7 @@ static int prop_compression_apply(struct inode *inode, const char *value, size_t len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int type; if (len == 0) { @@ -390,14 +391,17 @@ static int prop_compression_apply(struct inode *inode, return 0; } - if (!strncmp("lzo", value, 3)) + if (!strncmp("lzo", value, 3)) { type = BTRFS_COMPRESS_LZO; - else if (!strncmp("zlib", value, 4)) + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + } else if (!strncmp("zlib", value, 4)) { type = BTRFS_COMPRESS_ZLIB; - else if (!strncmp("zstd", value, len)) + } else if (!strncmp("zstd", value, len)) { type = BTRFS_COMPRESS_ZSTD; - else + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + } else { return -EINVAL; + } BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 09c7e4fd550f..9fb758d5077a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/btrfs.h> +#include <linux/sizes.h> #include "ctree.h" #include "transaction.h" @@ -2375,8 +2376,21 @@ out: return ret; } -static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) +/* + * Two limits to commit transaction in advance. + * + * For RATIO, it will be 1/RATIO of the remaining limit + * (excluding data and prealloc meta) as threshold. + * For SIZE, it will be in byte unit as threshold. + */ +#define QGROUP_PERTRANS_RATIO 32 +#define QGROUP_PERTRANS_SIZE SZ_32M +static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, + const struct btrfs_qgroup *qg, u64 num_bytes) { + u64 limit; + u64 threshold; + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) return false; @@ -2385,6 +2399,31 @@ static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) return false; + /* + * Even if we passed the check, it's better to check if reservation + * for meta_pertrans is pushing us near limit. + * If there is too much pertrans reservation or it's near the limit, + * let's try commit transaction to free some, using transaction_kthread + */ + if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER | + BTRFS_QGROUP_LIMIT_MAX_EXCL))) { + if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) + limit = qg->max_excl; + else + limit = qg->max_rfer; + threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] - + qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) / + QGROUP_PERTRANS_RATIO; + threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE); + + /* + * Use transaction_kthread to commit transaction, so we no + * longer need to bother nested transaction nor lock context. + */ + if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold) + btrfs_commit_transaction_locksafe(fs_info); + } + return true; } @@ -2434,7 +2473,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, qg = unode_aux_to_qgroup(unode); - if (enforce && !qgroup_check_limits(qg, num_bytes)) { + if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) { ret = -EDQUOT; goto out; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 00b7d3231821..b041b945a7ae 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1841,7 +1841,7 @@ again: old_bytenr = btrfs_node_blockptr(parent, slot); blocksize = fs_info->nodesize; old_ptr_gen = btrfs_node_ptr_generation(parent, slot); - btrfs_node_key_to_cpu(parent, &key, slot); + btrfs_node_key_to_cpu(parent, &first_key, slot); if (level <= max_level) { eb = path->nodes[level]; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 221e5cdb060b..c0074d2d7d6d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5236,6 +5236,10 @@ static int send_write_or_clone(struct send_ctx *sctx, len = btrfs_file_extent_num_bytes(path->nodes[0], ei); } + if (offset >= sctx->cur_inode_size) { + ret = 0; + goto out; + } if (offset + len > sctx->cur_inode_size) len = sctx->cur_inode_size - offset; if (len == 0) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 63fdcab64b01..c944b4769e3c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2267,6 +2267,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); + clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags); spin_lock(&fs_info->trans_lock); list_del_init(&cur_trans->list); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index c88fccd80bc5..d8c0826bc2c7 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -199,6 +199,20 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, int wait_for_unblock); + +/* + * Try to commit transaction asynchronously, so this is safe to call + * even holding a spinlock. + * + * It's done by informing transaction_kthread to commit transaction without + * waiting for commit interval. + */ +static inline void btrfs_commit_transaction_locksafe( + struct btrfs_fs_info *fs_info) +{ + set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags); + wake_up_process(fs_info->transaction_kthread); +} int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); int btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 43758e30aa7a..8f23a94dab77 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4320,6 +4320,110 @@ static int log_one_extent(struct btrfs_trans_handle *trans, return ret; } +/* + * Log all prealloc extents beyond the inode's i_size to make sure we do not + * lose them after doing a fast fsync and replaying the log. We scan the + * subvolume's root instead of iterating the inode's extent map tree because + * otherwise we can log incorrect extent items based on extent map conversion. + * That can happen due to the fact that extent maps are merged when they + * are not in the extent map tree's list of modified extents. + */ +static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path) +{ + struct btrfs_root *root = inode->root; + struct btrfs_key key; + const u64 i_size = i_size_read(&inode->vfs_inode); + const u64 ino = btrfs_ino(inode); + struct btrfs_path *dst_path = NULL; + u64 last_extent = (u64)-1; + int ins_nr = 0; + int start_slot; + int ret; + + if (!(inode->flags & BTRFS_INODE_PREALLOC)) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = i_size; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + if (ret < 0) + goto out; + ins_nr = 0; + } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid > ino) + break; + if (WARN_ON_ONCE(key.objectid < ino) || + key.type < BTRFS_EXTENT_DATA_KEY || + key.offset < i_size) { + path->slots[0]++; + continue; + } + if (last_extent == (u64)-1) { + last_extent = key.offset; + /* + * Avoid logging extent items logged in past fsync calls + * and leading to duplicate keys in the log tree. + */ + do { + ret = btrfs_truncate_inode_items(trans, + root->log_root, + &inode->vfs_inode, + i_size, + BTRFS_EXTENT_DATA_KEY); + } while (ret == -EAGAIN); + if (ret) + goto out; + } + if (ins_nr == 0) + start_slot = slot; + ins_nr++; + path->slots[0]++; + if (!dst_path) { + dst_path = btrfs_alloc_path(); + if (!dst_path) { + ret = -ENOMEM; + goto out; + } + } + } + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + start_slot, ins_nr, 1, 0); + if (ret > 0) + ret = 0; + } +out: + btrfs_release_path(path); + btrfs_free_path(dst_path); + return ret; +} + static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, @@ -4362,6 +4466,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, if (em->generation <= test_gen) continue; + /* We log prealloc extents beyond eof later. */ + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && + em->start >= i_size_read(&inode->vfs_inode)) + continue; + if (em->start < logged_start) logged_start = em->start; if ((em->start + em->len - 1) > logged_end) @@ -4374,31 +4483,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, num++; } - /* - * Add all prealloc extents beyond the inode's i_size to make sure we - * don't lose them after doing a fast fsync and replaying the log. - */ - if (inode->flags & BTRFS_INODE_PREALLOC) { - struct rb_node *node; - - for (node = rb_last(&tree->map); node; node = rb_prev(node)) { - em = rb_entry(node, struct extent_map, rb_node); - if (em->start < i_size_read(&inode->vfs_inode)) - break; - if (!list_empty(&em->list)) - continue; - /* Same as above loop. */ - if (++num > 32768) { - list_del_init(&tree->modified_extents); - ret = -EFBIG; - goto process; - } - refcount_inc(&em->refs); - set_bit(EXTENT_FLAG_LOGGING, &em->flags); - list_add_tail(&em->list, &extents); - } - } - list_sort(NULL, &extents, extent_cmp); btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); /* @@ -4443,6 +4527,9 @@ process: up_write(&inode->dio_sem); btrfs_release_path(path); + if (!ret) + ret = btrfs_log_prealloc_extents(trans, inode, path); + return ret; } @@ -4827,6 +4914,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct extent_map_tree *em_tree = &inode->extent_tree; u64 logged_isize = 0; bool need_log_inode_item = true; + bool xattrs_logged = false; path = btrfs_alloc_path(); if (!path) @@ -5128,6 +5216,7 @@ next_key: err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); if (err) goto out_unlock; + xattrs_logged = true; if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); @@ -5140,6 +5229,11 @@ log_extents: btrfs_release_path(dst_path); if (need_log_inode_item) { err = log_inode_item(trans, log, dst_path, inode); + if (!err && !xattrs_logged) { + err = btrfs_log_all_xattrs(trans, root, inode, path, + dst_path); + btrfs_release_path(path); + } if (err) goto out_unlock; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 292266f6ab9c..be3fc701f389 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4052,6 +4052,15 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } + /* + * A ro->rw remount sequence should continue with the paused balance + * regardless of who pauses it, system or the user as of now, so set + * the resume flag. + */ + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); return PTR_ERR_OR_ZERO(tsk); } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 0daa1e3fe0df..ab0bbe93b398 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -572,6 +572,11 @@ lookup_again: if (ret < 0) goto create_error; + if (unlikely(d_unhashed(next))) { + dput(next); + inode_unlock(d_inode(dir)); + goto lookup_again; + } ASSERT(d_backing_inode(next)); _debug("mkdir -> %p{%p{ino=%lu}}", @@ -764,6 +769,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, /* search the current directory for the element name */ inode_lock(d_inode(dir)); +retry: start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); cachefiles_hist(cachefiles_lookup_histogram, start); @@ -793,6 +799,10 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, if (ret < 0) goto mkdir_error; + if (unlikely(d_unhashed(subdir))) { + dput(subdir); + goto retry; + } ASSERT(d_backing_inode(subdir)); _debug("mkdir -> %p{%p{ino=%lu}}", diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c index 125b90f6c796..0ce1aa56b67f 100644 --- a/fs/cachefiles/proc.c +++ b/fs/cachefiles/proc.c @@ -85,21 +85,6 @@ static const struct seq_operations cachefiles_histogram_ops = { }; /* - * open "/proc/fs/cachefiles/XXX" which provide statistics summaries - */ -static int cachefiles_histogram_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &cachefiles_histogram_ops); -} - -static const struct file_operations cachefiles_histogram_fops = { - .open = cachefiles_histogram_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -/* * initialise the /proc/fs/cachefiles/ directory */ int __init cachefiles_proc_init(void) @@ -109,8 +94,8 @@ int __init cachefiles_proc_init(void) if (!proc_mkdir("fs/cachefiles", NULL)) goto error_dir; - if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL, - &cachefiles_histogram_fops)) + if (!proc_create_seq("fs/cachefiles/histogram", S_IFREG | 0444, NULL, + &cachefiles_histogram_ops)) goto error_histogram; _leave(" = 0"); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f85040d73e3d..cf0e45b10121 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -70,69 +70,104 @@ static __le32 ceph_flags_sys2wire(u32 flags) */ /* - * Calculate the length sum of direct io vectors that can - * be combined into one page vector. + * How many pages to get in one call to iov_iter_get_pages(). This + * determines the size of the on-stack array used as a buffer. */ -static size_t dio_get_pagev_size(const struct iov_iter *it) +#define ITER_GET_BVECS_PAGES 64 + +static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize, + struct bio_vec *bvecs) { - const struct iovec *iov = it->iov; - const struct iovec *iovend = iov + it->nr_segs; - size_t size; - - size = iov->iov_len - it->iov_offset; - /* - * An iov can be page vectored when both the current tail - * and the next base are page aligned. - */ - while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) && - (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) { - size += iov->iov_len; - } - dout("dio_get_pagevlen len = %zu\n", size); - return size; + size_t size = 0; + int bvec_idx = 0; + + if (maxsize > iov_iter_count(iter)) + maxsize = iov_iter_count(iter); + + while (size < maxsize) { + struct page *pages[ITER_GET_BVECS_PAGES]; + ssize_t bytes; + size_t start; + int idx = 0; + + bytes = iov_iter_get_pages(iter, pages, maxsize - size, + ITER_GET_BVECS_PAGES, &start); + if (bytes < 0) + return size ?: bytes; + + iov_iter_advance(iter, bytes); + size += bytes; + + for ( ; bytes; idx++, bvec_idx++) { + struct bio_vec bv = { + .bv_page = pages[idx], + .bv_len = min_t(int, bytes, PAGE_SIZE - start), + .bv_offset = start, + }; + + bvecs[bvec_idx] = bv; + bytes -= bv.bv_len; + start = 0; + } + } + + return size; } /* - * Allocate a page vector based on (@it, @nbytes). - * The return value is the tuple describing a page vector, - * that is (@pages, @page_align, @num_pages). + * iov_iter_get_pages() only considers one iov_iter segment, no matter + * what maxsize or maxpages are given. For ITER_BVEC that is a single + * page. + * + * Attempt to get up to @maxsize bytes worth of pages from @iter. + * Return the number of bytes in the created bio_vec array, or an error. */ -static struct page ** -dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes, - size_t *page_align, int *num_pages) +static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize, + struct bio_vec **bvecs, int *num_bvecs) { - struct iov_iter tmp_it = *it; - size_t align; - struct page **pages; - int ret = 0, idx, npages; + struct bio_vec *bv; + size_t orig_count = iov_iter_count(iter); + ssize_t bytes; + int npages; - align = (unsigned long)(it->iov->iov_base + it->iov_offset) & - (PAGE_SIZE - 1); - npages = calc_pages_for(align, nbytes); - pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL); - if (!pages) - return ERR_PTR(-ENOMEM); + iov_iter_truncate(iter, maxsize); + npages = iov_iter_npages(iter, INT_MAX); + iov_iter_reexpand(iter, orig_count); - for (idx = 0; idx < npages; ) { - size_t start; - ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes, - npages - idx, &start); - if (ret < 0) - goto fail; + /* + * __iter_get_bvecs() may populate only part of the array -- zero it + * out. + */ + bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO); + if (!bv) + return -ENOMEM; - iov_iter_advance(&tmp_it, ret); - nbytes -= ret; - idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE; + bytes = __iter_get_bvecs(iter, maxsize, bv); + if (bytes < 0) { + /* + * No pages were pinned -- just free the array. + */ + kvfree(bv); + return bytes; } - BUG_ON(nbytes != 0); - *num_pages = npages; - *page_align = align; - dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align); - return pages; -fail: - ceph_put_page_vector(pages, idx, false); - return ERR_PTR(ret); + *bvecs = bv; + *num_bvecs = npages; + return bytes; +} + +static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty) +{ + int i; + + for (i = 0; i < num_bvecs; i++) { + if (bvecs[i].bv_page) { + if (should_dirty) + set_page_dirty_lock(bvecs[i].bv_page); + put_page(bvecs[i].bv_page); + } + } + kvfree(bvecs); } /* @@ -746,11 +781,12 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) struct inode *inode = req->r_inode; struct ceph_aio_request *aio_req = req->r_priv; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); - int num_pages = calc_pages_for((u64)osd_data->alignment, - osd_data->length); - dout("ceph_aio_complete_req %p rc %d bytes %llu\n", - inode, rc, osd_data->length); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); + BUG_ON(!osd_data->num_bvecs); + + dout("ceph_aio_complete_req %p rc %d bytes %u\n", + inode, rc, osd_data->bvec_pos.iter.bi_size); if (rc == -EOLDSNAPC) { struct ceph_aio_work *aio_work; @@ -768,9 +804,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } else if (!aio_req->write) { if (rc == -ENOENT) rc = 0; - if (rc >= 0 && osd_data->length > rc) { - int zoff = osd_data->alignment + rc; - int zlen = osd_data->length - rc; + if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) { + struct iov_iter i; + int zlen = osd_data->bvec_pos.iter.bi_size - rc; + /* * If read is satisfied by single OSD request, * it can pass EOF. Otherwise read is within @@ -785,13 +822,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) aio_req->total_len = rc + zlen; } - if (zlen > 0) - ceph_zero_page_vector_range(zoff, zlen, - osd_data->pages); + iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs, + osd_data->num_bvecs, + osd_data->bvec_pos.iter.bi_size); + iov_iter_advance(&i, rc); + iov_iter_zero(zlen, &i); } } - ceph_put_page_vector(osd_data->pages, num_pages, aio_req->should_dirty); + put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, + aio_req->should_dirty); ceph_osdc_put_request(req); if (rc < 0) @@ -879,7 +919,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_vino vino; struct ceph_osd_request *req; - struct page **pages; + struct bio_vec *bvecs; struct ceph_aio_request *aio_req = NULL; int num_pages = 0; int flags; @@ -914,10 +954,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, } while (iov_iter_count(iter) > 0) { - u64 size = dio_get_pagev_size(iter); - size_t start = 0; + u64 size = iov_iter_count(iter); ssize_t len; + if (write) + size = min_t(u64, size, fsc->mount_options->wsize); + else + size = min_t(u64, size, fsc->mount_options->rsize); + vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &size, 0, @@ -933,18 +977,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, break; } - if (write) - size = min_t(u64, size, fsc->mount_options->wsize); - else - size = min_t(u64, size, fsc->mount_options->rsize); - - len = size; - pages = dio_get_pages_alloc(iter, len, &start, &num_pages); - if (IS_ERR(pages)) { + len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); + if (len < 0) { ceph_osdc_put_request(req); - ret = PTR_ERR(pages); + ret = len; break; } + if (len != size) + osd_req_op_extent_update(req, 0, len); /* * To simplify error handling, allow AIO when IO within i_size @@ -977,8 +1017,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req->r_mtime = mtime; } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, - false, false); + osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); if (aio_req) { aio_req->total_len += len; @@ -991,7 +1030,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs); pos += len; - iov_iter_advance(iter, len); continue; } @@ -1004,25 +1042,26 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (ret == -ENOENT) ret = 0; if (ret >= 0 && ret < len && pos + ret < size) { + struct iov_iter i; int zlen = min_t(size_t, len - ret, size - pos - ret); - ceph_zero_page_vector_range(start + ret, zlen, - pages); + + iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages, + len); + iov_iter_advance(&i, ret); + iov_iter_zero(zlen, &i); ret += zlen; } if (ret >= 0) len = ret; } - ceph_put_page_vector(pages, num_pages, should_dirty); - + put_bvecs(bvecs, num_pages, should_dirty); ceph_osdc_put_request(req); if (ret < 0) break; pos += len; - iov_iter_advance(iter, len); - if (!write && pos >= size) break; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8bf60250309e..ae056927080d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -669,13 +669,15 @@ void ceph_fill_file_time(struct inode *inode, int issued, CEPH_CAP_FILE_BUFFER| CEPH_CAP_AUTH_EXCL| CEPH_CAP_XATTR_EXCL)) { - if (timespec_compare(ctime, &inode->i_ctime) > 0) { + if (ci->i_version == 0 || + timespec_compare(ctime, &inode->i_ctime) > 0) { dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, ctime->tv_sec, ctime->tv_nsec); inode->i_ctime = *ctime; } - if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { + if (ci->i_version == 0 || + ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { /* the MDS did a utimes() */ dout("mtime %ld.%09ld -> %ld.%09ld " "tw %d -> %d\n", @@ -795,7 +797,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page, new_issued = ~issued & le32_to_cpu(info->cap.caps); /* update inode */ - ci->i_version = le64_to_cpu(info->version); inode->i_rdev = le32_to_cpu(info->rdev); inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; @@ -868,6 +869,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, xattr_blob = NULL; } + /* finally update i_version */ + ci->i_version = le64_to_cpu(info->version); + inode->i_mapping->a_ops = &ceph_aops; switch (inode->i_mode & S_IFMT) { diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 7e72348639e4..315f7e63e7cc 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -228,7 +228,15 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci) { - return (ci->i_max_files || ci->i_max_bytes); + bool ret = false; + spin_lock(&ci->i_ceph_lock); + if ((ci->i_max_files || ci->i_max_bytes) && + ci->i_vino.snap == CEPH_NOSNAP && + ci->i_snap_realm && + ci->i_snap_realm->ino == ci->i_vino.ino) + ret = true; + spin_unlock(&ci->i_ceph_lock); + return ret; } static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val, @@ -1008,14 +1016,19 @@ int __ceph_setxattr(struct inode *inode, const char *name, char *newval = NULL; struct ceph_inode_xattr *xattr = NULL; int required_blob_size; + bool check_realm = false; bool lock_snap_rwsem = false; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; vxattr = ceph_match_vxattr(inode, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; + if (vxattr) { + if (vxattr->readonly) + return -EOPNOTSUPP; + if (value && !strncmp(vxattr->name, "ceph.quota", 10)) + check_realm = true; + } /* pass any unhandled ceph.* xattrs through to the MDS */ if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) @@ -1109,6 +1122,15 @@ do_sync_unlocked: err = -EBUSY; } else { err = ceph_sync_setxattr(inode, name, value, size, flags); + if (err >= 0 && check_realm) { + /* check if snaprealm was created for quota inode */ + spin_lock(&ci->i_ceph_lock); + if ((ci->i_max_files || ci->i_max_bytes) && + !(ci->i_snap_realm && + ci->i_snap_realm->ino == ci->i_vino.ino)) + err = -EOPNOTSUPP; + spin_unlock(&ci->i_ceph_lock); + } } out: ceph_free_cap_flush(prealloc_cf); diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 741749a98614..5f132d59dfc2 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -197,7 +197,7 @@ config CIFS_SMB311 config CIFS_SMB_DIRECT bool "SMB Direct support (Experimental)" - depends on CIFS=m && INFINIBAND || CIFS=y && INFINIBAND=y + depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y help Enables SMB Direct experimental support for SMB 3.0, 3.02 and 3.1.1. SMB Direct allows transferring SMB packets over RDMA. If unsure, diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 9d69ea433330..4bc4a7ac61d9 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -314,18 +314,6 @@ skip_rdma: return 0; } -static int cifs_debug_data_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, cifs_debug_data_proc_show, NULL); -} - -static const struct file_operations cifs_debug_data_proc_fops = { - .open = cifs_debug_data_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - #ifdef CONFIG_CIFS_STATS static ssize_t cifs_stats_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) @@ -497,7 +485,8 @@ cifs_proc_init(void) if (proc_fs_cifs == NULL) return; - proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops); + proc_create_single("DebugData", 0, proc_fs_cifs, + cifs_debug_data_proc_show); #ifdef CONFIG_CIFS_STATS proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops); diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index fe5567655662..0e74690d11bc 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -54,7 +54,7 @@ do { \ pr_debug_ ## ratefunc("%s: " \ fmt, __FILE__, ##__VA_ARGS__); \ } else if ((type) & VFS) { \ - pr_err_ ## ratefunc("CuIFS VFS: " \ + pr_err_ ## ratefunc("CIFS VFS: " \ fmt, ##__VA_ARGS__); \ } else if ((type) & NOISY && (NOISY != 0)) { \ pr_debug_ ## ratefunc(fmt, ##__VA_ARGS__); \ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f715609b13f3..5a5a0158cc8f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1047,6 +1047,18 @@ out: return rc; } +/* + * Directory operations under CIFS/SMB2/SMB3 are synchronous, so fsync() + * is a dummy operation. + */ +static int cifs_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + cifs_dbg(FYI, "Sync directory - name: %pD datasync: 0x%x\n", + file, datasync); + + return 0; +} + static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, size_t len, unsigned int flags) @@ -1181,6 +1193,7 @@ const struct file_operations cifs_dir_ops = { .copy_file_range = cifs_copy_file_range, .clone_file_range = cifs_clone_file_range, .llseek = generic_file_llseek, + .fsync = cifs_dir_fsync, }; static void diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6d3e40d7029c..1529a088383d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -455,6 +455,9 @@ cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) server->sign = true; } + if (cifs_rdma_enabled(server) && server->sign) + cifs_dbg(VFS, "Signing is enabled, and RDMA read/write will be disabled"); + return 0; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e8830f076a7f..7a10a5d0731f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1977,14 +1977,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } -#ifdef CONFIG_CIFS_SMB_DIRECT - if (vol->rdma && vol->sign) { - cifs_dbg(VFS, "Currently SMB direct doesn't support signing." - " This is being fixed\n"); - goto cifs_parse_mount_err; - } -#endif - #ifndef CONFIG_KEYS /* Muliuser mounts require CONFIG_KEYS support */ if (vol->multiuser) { @@ -2959,6 +2951,22 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) } } + if (volume_info->seal) { + if (ses->server->vals->protocol_id == 0) { + cifs_dbg(VFS, + "SMB3 or later required for encryption\n"); + rc = -EOPNOTSUPP; + goto out_fail; + } else if (tcon->ses->server->capabilities & + SMB2_GLOBAL_CAP_ENCRYPTION) + tcon->seal = true; + else { + cifs_dbg(VFS, "Encryption is not supported on share\n"); + rc = -EOPNOTSUPP; + goto out_fail; + } + } + /* * BB Do we need to wrap session_mutex around this TCon call and Unix * SetFS as we do on SessSetup and reconnect? @@ -3007,22 +3015,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->use_resilient = true; } - if (volume_info->seal) { - if (ses->server->vals->protocol_id == 0) { - cifs_dbg(VFS, - "SMB3 or later required for encryption\n"); - rc = -EOPNOTSUPP; - goto out_fail; - } else if (tcon->ses->server->capabilities & - SMB2_GLOBAL_CAP_ENCRYPTION) - tcon->seal = true; - else { - cifs_dbg(VFS, "Encryption is not supported on share\n"); - rc = -EOPNOTSUPP; - goto out_fail; - } - } - /* * We can have only one retry value for a connection to a share so for * resources mounted more than once to the same server share the last diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 81ba6e0d88d8..925844343038 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -684,6 +684,9 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, goto mknod_out; } + if (!S_ISCHR(mode) && !S_ISBLK(mode)) + goto mknod_out; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) goto mknod_out; @@ -692,10 +695,8 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (buf == NULL) { - kfree(full_path); rc = -ENOMEM; - free_xid(xid); - return rc; + goto mknod_out; } if (backup_cred(cifs_sb)) @@ -742,7 +743,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, pdev->minor = cpu_to_le64(MINOR(device_number)); rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, &bytes_written, iov, 1); - } /* else if (S_ISFIFO) */ + } tcon->ses->server->ops->close(xid, tcon, &fid); d_drop(direntry); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 4bcd4e838b47..23fd430fe74a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3462,7 +3462,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) * If the page is mmap'ed into a process' page tables, then we need to make * sure that it doesn't change while being written back. */ -static int +static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index b4ae932ea134..9c6d95ffca97 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -252,9 +252,14 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); #ifdef CONFIG_CIFS_SMB_DIRECT - if (server->rdma) - wsize = min_t(unsigned int, + if (server->rdma) { + if (server->sign) + wsize = min_t(unsigned int, + wsize, server->smbd_conn->max_fragmented_send_size); + else + wsize = min_t(unsigned int, wsize, server->smbd_conn->max_readwrite_size); + } #endif if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); @@ -272,9 +277,14 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); #ifdef CONFIG_CIFS_SMB_DIRECT - if (server->rdma) - rsize = min_t(unsigned int, + if (server->rdma) { + if (server->sign) + rsize = min_t(unsigned int, + rsize, server->smbd_conn->max_fragmented_recv_size); + else + rsize = min_t(unsigned int, rsize, server->smbd_conn->max_readwrite_size); + } #endif if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) @@ -579,9 +589,15 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + /* + * If ea_name is NULL (listxattr) and there are no EAs, return 0 as it's + * not an error. Otherwise, the specified ea_name was not found. + */ if (!rc) rc = move_smb2_ea_to_cifs(ea_data, buf_size, smb2_data, SMB2_MAX_EA_BUF, ea_name); + else if (!ea_name && rc == -ENODATA) + rc = 0; kfree(smb2_data); return rc; @@ -1452,7 +1468,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; struct kvec err_iov = {NULL, 0}; - struct smb2_err_rsp *err_buf = NULL; + struct smb2_err_rsp *err_buf; struct smb2_symlink_err_rsp *symlink; unsigned int sub_len; unsigned int sub_offset; @@ -1476,7 +1492,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov); - if (!rc || !err_buf) { + if (!rc || !err_iov.iov_base) { kfree(utf16_path); return -ENOENT; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0f044c4a2dc9..0f48741a0130 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -383,10 +383,10 @@ static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) { pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES; - pneg_ctxt->DataLength = cpu_to_le16(6); - pneg_ctxt->CipherCount = cpu_to_le16(2); - pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; - pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; + pneg_ctxt->DataLength = cpu_to_le16(4); /* Cipher Count + le16 cipher */ + pneg_ctxt->CipherCount = cpu_to_le16(1); +/* pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;*/ /* not supported yet */ + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_CCM; } static void @@ -444,6 +444,7 @@ static int decode_encrypt_ctx(struct TCP_Server_Info *server, return -EINVAL; } server->cipher_type = ctxt->Ciphers[0]; + server->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; return 0; } @@ -729,19 +730,14 @@ neg_exit: int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) { - int rc = 0; - struct validate_negotiate_info_req vneg_inbuf; + int rc; + struct validate_negotiate_info_req *pneg_inbuf; struct validate_negotiate_info_rsp *pneg_rsp = NULL; u32 rsplen; u32 inbuflen; /* max of 4 dialects */ cifs_dbg(FYI, "validate negotiate\n"); -#ifdef CONFIG_CIFS_SMB_DIRECT - if (tcon->ses->server->rdma) - return 0; -#endif - /* In SMB3.11 preauth integrity supersedes validate negotiate */ if (tcon->ses->server->dialect == SMB311_PROT_ID) return 0; @@ -764,63 +760,69 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) cifs_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n"); - vneg_inbuf.Capabilities = + pneg_inbuf = kmalloc(sizeof(*pneg_inbuf), GFP_NOFS); + if (!pneg_inbuf) + return -ENOMEM; + + pneg_inbuf->Capabilities = cpu_to_le32(tcon->ses->server->vals->req_capabilities); - memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid, + memcpy(pneg_inbuf->Guid, tcon->ses->server->client_guid, SMB2_CLIENT_GUID_SIZE); if (tcon->ses->sign) - vneg_inbuf.SecurityMode = + pneg_inbuf->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED); else if (global_secflags & CIFSSEC_MAY_SIGN) - vneg_inbuf.SecurityMode = + pneg_inbuf->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED); else - vneg_inbuf.SecurityMode = 0; + pneg_inbuf->SecurityMode = 0; if (strcmp(tcon->ses->server->vals->version_string, SMB3ANY_VERSION_STRING) == 0) { - vneg_inbuf.Dialects[0] = cpu_to_le16(SMB30_PROT_ID); - vneg_inbuf.Dialects[1] = cpu_to_le16(SMB302_PROT_ID); - vneg_inbuf.DialectCount = cpu_to_le16(2); + pneg_inbuf->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); + pneg_inbuf->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); + pneg_inbuf->DialectCount = cpu_to_le16(2); /* structure is big enough for 3 dialects, sending only 2 */ - inbuflen = sizeof(struct validate_negotiate_info_req) - 2; + inbuflen = sizeof(*pneg_inbuf) - + sizeof(pneg_inbuf->Dialects[0]); } else if (strcmp(tcon->ses->server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { - vneg_inbuf.Dialects[0] = cpu_to_le16(SMB21_PROT_ID); - vneg_inbuf.Dialects[1] = cpu_to_le16(SMB30_PROT_ID); - vneg_inbuf.Dialects[2] = cpu_to_le16(SMB302_PROT_ID); - vneg_inbuf.DialectCount = cpu_to_le16(3); + pneg_inbuf->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); + pneg_inbuf->Dialects[1] = cpu_to_le16(SMB30_PROT_ID); + pneg_inbuf->Dialects[2] = cpu_to_le16(SMB302_PROT_ID); + pneg_inbuf->DialectCount = cpu_to_le16(3); /* structure is big enough for 3 dialects */ - inbuflen = sizeof(struct validate_negotiate_info_req); + inbuflen = sizeof(*pneg_inbuf); } else { /* otherwise specific dialect was requested */ - vneg_inbuf.Dialects[0] = + pneg_inbuf->Dialects[0] = cpu_to_le16(tcon->ses->server->vals->protocol_id); - vneg_inbuf.DialectCount = cpu_to_le16(1); + pneg_inbuf->DialectCount = cpu_to_le16(1); /* structure is big enough for 3 dialects, sending only 1 */ - inbuflen = sizeof(struct validate_negotiate_info_req) - 4; + inbuflen = sizeof(*pneg_inbuf) - + sizeof(pneg_inbuf->Dialects[0]) * 2; } rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, - (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req), - (char **)&pneg_rsp, &rsplen); + (char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen); if (rc != 0) { cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); - return -EIO; + rc = -EIO; + goto out_free_inbuf; } - if (rsplen != sizeof(struct validate_negotiate_info_rsp)) { + rc = -EIO; + if (rsplen != sizeof(*pneg_rsp)) { cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n", rsplen); /* relax check since Mac returns max bufsize allowed on ioctl */ - if ((rsplen > CIFSMaxBufSize) - || (rsplen < sizeof(struct validate_negotiate_info_rsp))) - goto err_rsp_free; + if (rsplen > CIFSMaxBufSize || rsplen < sizeof(*pneg_rsp)) + goto out_free_rsp; } /* check validate negotiate info response matches what we got earlier */ @@ -837,15 +839,17 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) goto vneg_out; /* validate negotiate successful */ + rc = 0; cifs_dbg(FYI, "validate negotiate info successful\n"); - kfree(pneg_rsp); - return 0; + goto out_free_rsp; vneg_out: cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n"); -err_rsp_free: +out_free_rsp: kfree(pneg_rsp); - return -EIO; +out_free_inbuf: + kfree(pneg_inbuf); + return rc; } enum securityEnum @@ -2590,7 +2594,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * If we want to do a RDMA write, fill in and append * smbd_buffer_descriptor_v1 to the end of read request */ - if (server->rdma && rdata && + if (server->rdma && rdata && !server->sign && rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) { struct smbd_buffer_descriptor_v1 *v1; @@ -2968,7 +2972,7 @@ smb2_async_writev(struct cifs_writedata *wdata, * If we want to do a server RDMA read, fill in and append * smbd_buffer_descriptor_v1 to the end of write request */ - if (server->rdma && wdata->bytes >= + if (server->rdma && !server->sign && wdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) { struct smbd_buffer_descriptor_v1 *v1; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 6093e5142b2b..d28f358022c5 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -297,7 +297,7 @@ struct smb2_encryption_neg_context { __le16 DataLength; __le32 Reserved; __le16 CipherCount; /* AES-128-GCM and AES-128-CCM */ - __le16 Ciphers[2]; /* Ciphers[0] since only one used now */ + __le16 Ciphers[1]; /* Ciphers[0] since only one used now */ } __packed; struct smb2_negotiate_rsp { diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 5008af546dd1..c62f7c95683c 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -1028,7 +1028,7 @@ static int smbd_post_send(struct smbd_connection *info, for (i = 0; i < request->num_sge; i++) { log_rdma_send(INFO, "rdma_request sge[%d] addr=%llu length=%u\n", - i, request->sge[0].addr, request->sge[0].length); + i, request->sge[i].addr, request->sge[i].length); ib_dma_sync_single_for_device( info->id->device, request->sge[i].addr, @@ -2086,7 +2086,7 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) int start, i, j; int max_iov_size = info->max_send_size - sizeof(struct smbd_data_transfer); - struct kvec iov[SMBDIRECT_MAX_SGE]; + struct kvec *iov; int rc; info->smbd_send_pending++; @@ -2096,32 +2096,20 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) } /* - * This usually means a configuration error - * We use RDMA read/write for packet size > rdma_readwrite_threshold - * as long as it's properly configured we should never get into this - * situation - */ - if (rqst->rq_nvec + rqst->rq_npages > SMBDIRECT_MAX_SGE) { - log_write(ERR, "maximum send segment %x exceeding %x\n", - rqst->rq_nvec + rqst->rq_npages, SMBDIRECT_MAX_SGE); - rc = -EINVAL; - goto done; - } - - /* - * Remove the RFC1002 length defined in MS-SMB2 section 2.1 - * It is used only for TCP transport + * Skip the RFC1002 length defined in MS-SMB2 section 2.1 + * It is used only for TCP transport in the iov[0] * In future we may want to add a transport layer under protocol * layer so this will only be issued to TCP transport */ - iov[0].iov_base = (char *)rqst->rq_iov[0].iov_base + 4; - iov[0].iov_len = rqst->rq_iov[0].iov_len - 4; - buflen += iov[0].iov_len; + + if (rqst->rq_iov[0].iov_len != 4) { + log_write(ERR, "expected the pdu length in 1st iov, but got %zu\n", rqst->rq_iov[0].iov_len); + return -EINVAL; + } + iov = &rqst->rq_iov[1]; /* total up iov array first */ - for (i = 1; i < rqst->rq_nvec; i++) { - iov[i].iov_base = rqst->rq_iov[i].iov_base; - iov[i].iov_len = rqst->rq_iov[i].iov_len; + for (i = 0; i < rqst->rq_nvec-1; i++) { buflen += iov[i].iov_len; } @@ -2139,6 +2127,10 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) goto done; } + cifs_dbg(FYI, "Sending smb (RDMA): smb_len=%u\n", buflen); + for (i = 0; i < rqst->rq_nvec-1; i++) + dump_smb(iov[i].iov_base, iov[i].iov_len); + remaining_data_length = buflen; log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d " @@ -2194,12 +2186,14 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) goto done; } i++; + if (i == rqst->rq_nvec-1) + break; } start = i; buflen = 0; } else { i++; - if (i == rqst->rq_nvec) { + if (i == rqst->rq_nvec-1) { /* send out all remaining vecs */ remaining_data_length -= buflen; log_write(INFO, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 8f6f25918229..927226a2122f 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -753,7 +753,7 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, goto out; #ifdef CONFIG_CIFS_SMB311 - if (ses->status == CifsNew) + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) smb311_update_preauth_hash(ses, rqst->rq_iov+1, rqst->rq_nvec-1); #endif @@ -798,7 +798,7 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, *resp_buf_type = CIFS_SMALL_BUFFER; #ifdef CONFIG_CIFS_SMB311 - if (ses->status == CifsNew) { + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { struct kvec iov = { .iov_base = buf + 4, .iov_len = get_rfc1002_length(buf) @@ -834,8 +834,11 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, if (n_vec + 1 > CIFS_MAX_IOV_SIZE) { new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), GFP_KERNEL); - if (!new_iov) + if (!new_iov) { + /* otherwise cifs_send_recv below sets resp_buf_type */ + *resp_buf_type = CIFS_NO_BUFFER; return -ENOMEM; + } } else new_iov = s_iov; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 017b0ab19bc4..124b093d14e5 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -492,7 +492,7 @@ static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); - if (IS_ENABLED(CCONFIG_CRAMFS_MTD) && sb->s_mtd) { + if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) { if (sbi && sbi->mtd_point_size) mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size); kill_mtd_super(sb); diff --git a/fs/dcache.c b/fs/dcache.c index e9476e94372a..0e8e5de3c48a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -580,6 +580,7 @@ static void __dentry_kill(struct dentry *dentry) spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); + cond_resched(); } static struct dentry *__lock_parent(struct dentry *dentry) @@ -827,30 +828,24 @@ static inline bool fast_dput(struct dentry *dentry) */ void dput(struct dentry *dentry) { - if (unlikely(!dentry)) - return; + while (dentry) { + might_sleep(); -repeat: - might_sleep(); + rcu_read_lock(); + if (likely(fast_dput(dentry))) { + rcu_read_unlock(); + return; + } - rcu_read_lock(); - if (likely(fast_dput(dentry))) { + /* Slow case: now with the dentry lock held */ rcu_read_unlock(); - return; - } - - /* Slow case: now with the dentry lock held */ - rcu_read_unlock(); - if (likely(retain_dentry(dentry))) { - spin_unlock(&dentry->d_lock); - return; - } + if (likely(retain_dentry(dentry))) { + spin_unlock(&dentry->d_lock); + return; + } - dentry = dentry_kill(dentry); - if (dentry) { - cond_resched(); - goto repeat; + dentry = dentry_kill(dentry); } } EXPORT_SYMBOL(dput); @@ -1066,8 +1061,6 @@ static void shrink_dentry_list(struct list_head *list) while (!list_empty(list)) { struct dentry *dentry, *parent; - cond_resched(); - dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); @@ -1244,13 +1237,11 @@ enum d_walk_ret { * @parent: start of walk * @data: data passed to @enter() and @finish() * @enter: callback when first entering the dentry - * @finish: callback when successfully finished the walk * - * The @enter() and @finish() callbacks are called with d_lock held. + * The @enter() callbacks are called with d_lock held. */ static void d_walk(struct dentry *parent, void *data, - enum d_walk_ret (*enter)(void *, struct dentry *), - void (*finish)(void *)) + enum d_walk_ret (*enter)(void *, struct dentry *)) { struct dentry *this_parent; struct list_head *next; @@ -1339,8 +1330,6 @@ ascend: if (need_seqretry(&rename_lock, seq)) goto rename_retry; rcu_read_unlock(); - if (finish) - finish(data); out_unlock: spin_unlock(&this_parent->d_lock); @@ -1389,7 +1378,7 @@ int path_has_submounts(const struct path *parent) struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; read_seqlock_excl(&mount_lock); - d_walk(parent->dentry, &data, path_check_mount, NULL); + d_walk(parent->dentry, &data, path_check_mount); read_sequnlock_excl(&mount_lock); return data.mounted; @@ -1497,11 +1486,16 @@ void shrink_dcache_parent(struct dentry *parent) data.start = parent; data.found = 0; - d_walk(parent, &data, select_collect, NULL); + d_walk(parent, &data, select_collect); + + if (!list_empty(&data.dispose)) { + shrink_dentry_list(&data.dispose); + continue; + } + + cond_resched(); if (!data.found) break; - - shrink_dentry_list(&data.dispose); } } EXPORT_SYMBOL(shrink_dcache_parent); @@ -1532,7 +1526,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) static void do_one_tree(struct dentry *dentry) { shrink_dcache_parent(dentry); - d_walk(dentry, dentry, umount_check, NULL); + d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); } @@ -1556,78 +1550,48 @@ void shrink_dcache_for_umount(struct super_block *sb) } } -struct detach_data { - struct select_data select; - struct dentry *mountpoint; -}; -static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry) +static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { - struct detach_data *data = _data; - + struct dentry **victim = _data; if (d_mountpoint(dentry)) { __dget_dlock(dentry); - data->mountpoint = dentry; + *victim = dentry; return D_WALK_QUIT; } - - return select_collect(&data->select, dentry); -} - -static void check_and_drop(void *_data) -{ - struct detach_data *data = _data; - - if (!data->mountpoint && list_empty(&data->select.dispose)) - __d_drop(data->select.start); + return D_WALK_CONTINUE; } /** * d_invalidate - detach submounts, prune dcache, and drop * @dentry: dentry to invalidate (aka detach, prune and drop) - * - * no dcache lock. - * - * The final d_drop is done as an atomic operation relative to - * rename_lock ensuring there are no races with d_set_mounted. This - * ensures there are no unhashed dentries on the path to a mountpoint. */ void d_invalidate(struct dentry *dentry) { - /* - * If it's already been dropped, return OK. - */ + bool had_submounts = false; spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); return; } + __d_drop(dentry); spin_unlock(&dentry->d_lock); /* Negative dentries can be dropped without further checks */ - if (!dentry->d_inode) { - d_drop(dentry); + if (!dentry->d_inode) return; - } + shrink_dcache_parent(dentry); for (;;) { - struct detach_data data; - - data.mountpoint = NULL; - INIT_LIST_HEAD(&data.select.dispose); - data.select.start = dentry; - data.select.found = 0; - - d_walk(dentry, &data, detach_and_collect, check_and_drop); - - if (!list_empty(&data.select.dispose)) - shrink_dentry_list(&data.select.dispose); - else if (!data.mountpoint) + struct dentry *victim = NULL; + d_walk(dentry, &victim, find_submount); + if (!victim) { + if (had_submounts) + shrink_dcache_parent(dentry); return; - - if (data.mountpoint) { - detach_mounts(data.mountpoint); - dput(data.mountpoint); } + had_submounts = true; + detach_mounts(victim); + dput(victim); } } EXPORT_SYMBOL(d_invalidate); @@ -1913,6 +1877,28 @@ void d_instantiate(struct dentry *entry, struct inode * inode) } EXPORT_SYMBOL(d_instantiate); +/* + * This should be equivalent to d_instantiate() + unlock_new_inode(), + * with lockdep-related part of unlock_new_inode() done before + * anything else. Use that instead of open-coding d_instantiate()/ + * unlock_new_inode() combinations. + */ +void d_instantiate_new(struct dentry *entry, struct inode *inode) +{ + BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + BUG_ON(!inode); + lockdep_annotate_inode_mutex_key(inode); + security_d_instantiate(entry, inode); + spin_lock(&inode->i_lock); + __d_instantiate(entry, inode); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW; + smp_mb(); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(d_instantiate_new); + /** * d_instantiate_no_diralias - instantiate a non-aliased dentry * @entry: dentry to complete @@ -3097,7 +3083,7 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) void d_genocide(struct dentry *parent) { - d_walk(parent, parent, d_genocide_kill, NULL); + d_walk(parent, parent, d_genocide_kill); } EXPORT_SYMBOL(d_genocide); diff --git a/fs/direct-io.c b/fs/direct-io.c index 874607bb6e02..093fb54cd316 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -432,8 +432,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct bio *bio; /* - * bio_alloc() is guaranteed to return a bio when called with - * __GFP_RECLAIM and we request a valid number of vectors. + * bio_alloc() is guaranteed to return a bio when allowed to sleep and + * we request a valid number of vectors. */ bio = bio_alloc(GFP_KERNEL, nr_vecs); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 846ca150d52e..4dd842f72846 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1997,6 +1997,16 @@ out: return rc; } +static bool is_dot_dotdot(const char *name, size_t name_size) +{ + if (name_size == 1 && name[0] == '.') + return true; + else if (name_size == 2 && name[0] == '.' && name[1] == '.') + return true; + + return false; +} + /** * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext * @plaintext_name: The plaintext name @@ -2021,13 +2031,21 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, size_t packet_size; int rc = 0; - if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) - && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) - && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) - && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, - ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { - const char *orig_name = name; - size_t orig_name_size = name_size; + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && + !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)) { + if (is_dot_dotdot(name, name_size)) { + rc = ecryptfs_copy_filename(plaintext_name, + plaintext_name_size, + name, name_size); + goto out; + } + + if (name_size <= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE || + strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)) { + rc = -EINVAL; + goto out; + } name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; @@ -2047,12 +2065,9 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, decoded_name, decoded_name_size); if (rc) { - printk(KERN_INFO "%s: Could not parse tag 70 packet " - "from filename; copying through filename " - "as-is\n", __func__); - rc = ecryptfs_copy_filename(plaintext_name, - plaintext_name_size, - orig_name, orig_name_size); + ecryptfs_printk(KERN_DEBUG, + "%s: Could not parse tag 70 packet from filename\n", + __func__); goto out_free; } } else { diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index c74ed3ca3372..b76a9853325e 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -82,17 +82,28 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, buf->sb, lower_name, lower_namelen); if (rc) { - printk(KERN_ERR "%s: Error attempting to decode and decrypt " - "filename [%s]; rc = [%d]\n", __func__, lower_name, - rc); - goto out; + if (rc != -EINVAL) { + ecryptfs_printk(KERN_DEBUG, + "%s: Error attempting to decode and decrypt filename [%s]; rc = [%d]\n", + __func__, lower_name, rc); + return rc; + } + + /* Mask -EINVAL errors as these are most likely due a plaintext + * filename present in the lower filesystem despite filename + * encryption being enabled. One unavoidable example would be + * the "lost+found" dentry in the root directory of an Ext4 + * filesystem. + */ + return 0; } + buf->caller->pos = buf->ctx.pos; rc = !dir_emit(buf->caller, name, name_size, ino, d_type); kfree(name); if (!rc) buf->entries_written++; -out: + return rc; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 847904aa63a9..49121e5a8de2 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -283,8 +283,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, iget_failed(ecryptfs_inode); goto out; } - unlock_new_inode(ecryptfs_inode); - d_instantiate(ecryptfs_dentry, ecryptfs_inode); + d_instantiate_new(ecryptfs_dentry, ecryptfs_inode); out: return rc; } @@ -395,8 +394,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, mount_crypt_stat = &ecryptfs_superblock_to_private( ecryptfs_dentry->d_sb)->mount_crypt_stat; - if (mount_crypt_stat - && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) { + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { rc = ecryptfs_encrypt_and_encode_filename( &encrypted_and_encoded_name, &len, mount_crypt_stat, name, len); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index c89a58cfc991..e74fe84d0886 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1880,7 +1880,7 @@ find_next_matching_auth_tok: candidate_auth_tok = &auth_tok_list_item->auth_tok; if (unlikely(ecryptfs_verbosity > 0)) { ecryptfs_printk(KERN_DEBUG, - "Considering cadidate auth tok:\n"); + "Considering candidate auth tok:\n"); ecryptfs_dump_auth_tok(candidate_auth_tok); } rc = ecryptfs_get_auth_tok_sig(&candidate_auth_tok_sig, diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 3c6a9c156b7a..ddbf87246898 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -790,7 +790,7 @@ int ore_create(struct ore_io_state *ios) for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; - or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, i)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -815,7 +815,7 @@ int ore_remove(struct ore_io_state *ios) for (i = 0; i < ios->oc->numdevs; i++) { struct osd_request *or; - or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, i)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -847,7 +847,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; - or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, dev)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; @@ -966,7 +966,7 @@ int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) return 0; /* Just an empty slot */ first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; - or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, first_dev)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; @@ -1060,7 +1060,7 @@ static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; - or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, cur_comp)); if (unlikely(!or)) { ORE_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 179cd5c2f52a..719a3152da80 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -229,7 +229,7 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, u64 offset, void *p, unsigned length) { - struct osd_request *or = osd_start_request(od, GFP_KERNEL); + struct osd_request *or = osd_start_request(od); /* struct osd_sense_info osi = {.key = 0};*/ int ret; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 09640220fda8..047c327a6b23 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -88,11 +88,11 @@ out_unlock: * The default page_lock and i_size verification done by non-DAX fault paths * is sufficient because ext2 doesn't support hole punching. */ -static int ext2_dax_fault(struct vm_fault *vmf) +static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); - int ret; + vm_fault_t ret; if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 1e01fabef130..71635909df3b 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1264,21 +1264,11 @@ do_indirects: static void ext2_truncate_blocks(struct inode *inode, loff_t offset) { - /* - * XXX: it seems like a bug here that we don't allow - * IS_APPEND inode to have blocks-past-i_size trimmed off. - * review and fix this. - * - * Also would be nice to be able to handle IO errors and such, - * but that's probably too much to ask. - */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return; if (ext2_inode_is_fast_symlink(inode)) return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; dax_sem_down_write(EXT2_I(inode)); __ext2_truncate_blocks(inode, offset); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 55f7caadb093..152453a91877 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -41,8 +41,7 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) { int err = ext2_add_link(dentry, inode); if (!err) { - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } inode_dec_link_count(inode); @@ -255,8 +254,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) if (err) goto out_fail; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); out: return err; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index a33d8fb1bf2a..508b905d744d 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -321,6 +321,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; + ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb); ext4_fsblk_t blk; ext4_fsblk_t group_first_block; @@ -338,7 +339,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; - if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; @@ -346,7 +347,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; - if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; @@ -354,8 +355,8 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, /* check whether the inode table block number is set */ blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; - if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || - EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= sb->s_blocksize) + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || + EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= max_bit) return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, EXT4_B2C(sbi, offset + sbi->s_itb_per_group), diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a42e71203e53..229ea4da6785 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2390,7 +2390,7 @@ extern int ext4_init_inode_table(struct super_block *sb, extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* mballoc.c */ -extern const struct file_operations ext4_seq_mb_groups_fops; +extern const struct seq_operations ext4_mb_seq_groups_ops; extern long ext4_mb_stats; extern long ext4_mb_max_to_scan; extern int ext4_mb_init(struct super_block *); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0a7315961bac..c969275ce3ee 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5329,8 +5329,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, stop = le32_to_cpu(extent->ee_block); /* - * In case of left shift, Don't start shifting extents until we make - * sure the hole is big enough to accommodate the shift. + * For left shifts, make sure the hole on the left is big enough to + * accommodate the shift. For right shifts, make sure the last extent + * won't be shifted beyond EXT_MAX_BLOCKS. */ if (SHIFT == SHIFT_LEFT) { path = ext4_find_extent(inode, start - 1, &path, @@ -5350,9 +5351,14 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if ((start == ex_start && shift > ex_start) || (shift > start - ex_end)) { - ext4_ext_drop_refs(path); - kfree(path); - return -EINVAL; + ret = -EINVAL; + goto out; + } + } else { + if (shift > EXT_MAX_BLOCKS - + (stop + ext4_ext_get_actual_len(extent))) { + ret = -EINVAL; + goto out; } } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 769a62708b1c..6884e81c1465 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2254,7 +2254,7 @@ out: static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { - struct super_block *sb = seq->private; + struct super_block *sb = PDE_DATA(file_inode(seq->file)); ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) @@ -2265,7 +2265,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { - struct super_block *sb = seq->private; + struct super_block *sb = PDE_DATA(file_inode(seq->file)); ext4_group_t group; ++*pos; @@ -2277,7 +2277,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { - struct super_block *sb = seq->private; + struct super_block *sb = PDE_DATA(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; int err, buddy_loaded = 0; @@ -2330,34 +2330,13 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } -static const struct seq_operations ext4_mb_seq_groups_ops = { +const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, .show = ext4_mb_seq_groups_show, }; -static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) -{ - struct super_block *sb = PDE_DATA(inode); - int rc; - - rc = seq_open(file, &ext4_mb_seq_groups_ops); - if (rc == 0) { - struct seq_file *m = file->private_data; - m->private = sb; - } - return rc; - -} - -const struct file_operations ext4_seq_mb_groups_fops = { - .open = ext4_mb_seq_groups_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b1f21e3a0763..4a09063ce1d2 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2411,8 +2411,7 @@ static int ext4_add_nondir(handle_t *handle, int err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } drop_nlink(inode); @@ -2651,8 +2650,7 @@ out_clear_inode: err = ext4_mark_inode_dirty(handle, dir); if (err) goto out_clear_inode; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 185f7e61f4cf..eb104e8476f0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5886,5 +5886,6 @@ static void __exit ext4_exit_fs(void) MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32c"); module_init(ext4_init_fs) module_exit(ext4_exit_fs) diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 9ebd26c957c2..f34da0bb8f17 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -346,39 +346,9 @@ static struct kobject *ext4_root; static struct kobject *ext4_feat; -#define PROC_FILE_SHOW_DEFN(name) \ -static int name##_open(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, ext4_seq_##name##_show, PDE_DATA(inode)); \ -} \ -\ -static const struct file_operations ext4_seq_##name##_fops = { \ - .open = name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -} - -#define PROC_FILE_LIST(name) \ - { __stringify(name), &ext4_seq_##name##_fops } - -PROC_FILE_SHOW_DEFN(es_shrinker_info); -PROC_FILE_SHOW_DEFN(options); - -static const struct ext4_proc_files { - const char *name; - const struct file_operations *fops; -} proc_files[] = { - PROC_FILE_LIST(options), - PROC_FILE_LIST(es_shrinker_info), - PROC_FILE_LIST(mb_groups), - { NULL, NULL }, -}; - int ext4_register_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - const struct ext4_proc_files *p; int err; init_completion(&sbi->s_kobj_unregister); @@ -392,11 +362,14 @@ int ext4_register_sysfs(struct super_block *sb) if (ext4_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); - if (sbi->s_proc) { - for (p = proc_files; p->name; p++) - proc_create_data(p->name, S_IRUGO, sbi->s_proc, - p->fops, sb); + proc_create_single_data("options", S_IRUGO, sbi->s_proc, + ext4_seq_options_show, sb); + proc_create_single_data("es_shrinker_info", S_IRUGO, + sbi->s_proc, ext4_seq_es_shrinker_info_show, + sb); + proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_ops, sb); } return 0; } @@ -404,13 +377,9 @@ int ext4_register_sysfs(struct super_block *sb) void ext4_unregister_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - const struct ext4_proc_files *p; - if (sbi->s_proc) { - for (p = proc_files; p->name; p++) - remove_proc_entry(p->name, sbi->s_proc); - remove_proc_entry(sb->s_id, ext4_proc_root); - } + if (sbi->s_proc) + remove_proc_subtree(sb->s_id, ext4_proc_root); kobject_del(&sbi->s_kobj); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index d5098efe577c..75e37fd720b2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -294,8 +294,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, alloc_nid_done(sbi, ino); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -597,8 +596,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, err = page_symlink(inode, disk_link.name, disk_link.len); err_out: - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); /* * Let's flush symlink data in order to avoid broken symlink as much as @@ -661,8 +659,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) alloc_nid_done(sbi, inode->i_ino); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -713,8 +710,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, alloc_nid_done(sbi, inode->i_ino); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f33a56d6e6dd..4b47ca6296a7 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -572,23 +572,6 @@ static int iostat_info_seq_show(struct seq_file *seq, void *offset) return 0; } -#define F2FS_PROC_FILE_DEF(_name) \ -static int _name##_open_fs(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ -} \ - \ -static const struct file_operations f2fs_seq_##_name##_fops = { \ - .open = _name##_open_fs, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -F2FS_PROC_FILE_DEF(segment_info); -F2FS_PROC_FILE_DEF(segment_bits); -F2FS_PROC_FILE_DEF(iostat_info); - int __init f2fs_init_sysfs(void) { int ret; @@ -632,12 +615,12 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); if (sbi->s_proc) { - proc_create_data("segment_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_info_fops, sb); - proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_bits_fops, sb); - proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_iostat_info_fops, sb); + proc_create_single_data("segment_info", S_IRUGO, sbi->s_proc, + segment_info_seq_show, sb); + proc_create_single_data("segment_bits", S_IRUGO, sbi->s_proc, + segment_bits_seq_show, sb); + proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc, + iostat_info_seq_show, sb); } return 0; } diff --git a/fs/filesystems.c b/fs/filesystems.c index f2728a4a03a1..b03f57b1105b 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -238,21 +238,9 @@ static int filesystems_proc_show(struct seq_file *m, void *v) return 0; } -static int filesystems_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, filesystems_proc_show, NULL); -} - -static const struct file_operations filesystems_proc_fops = { - .open = filesystems_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_filesystems_init(void) { - proc_create("filesystems", 0, NULL, &filesystems_proc_fops); + proc_create_single("filesystems", 0, NULL, filesystems_proc_show); return 0; } module_init(proc_filesystems_init); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b12ba70a895..471d863958bc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -745,11 +745,12 @@ int inode_congested(struct inode *inode, int cong_bits) */ if (inode && inode_to_wb_is_valid(inode)) { struct bdi_writeback *wb; - bool locked, congested; + struct wb_lock_cookie lock_cookie = {}; + bool congested; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &lock_cookie); return congested; } @@ -1960,7 +1961,7 @@ void wb_workfn(struct work_struct *work) } if (!list_empty(&wb->work_list)) - mod_delayed_work(bdi_wq, &wb->dwork, 0); + wb_wakeup(wb); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) wb_wakeup_delayed(wb); diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c index 15a3d042247e..9a13e9e15b69 100644 --- a/fs/fscache/histogram.c +++ b/fs/fscache/histogram.c @@ -83,24 +83,9 @@ static void fscache_histogram_stop(struct seq_file *m, void *v) { } -static const struct seq_operations fscache_histogram_ops = { +const struct seq_operations fscache_histogram_ops = { .start = fscache_histogram_start, .stop = fscache_histogram_stop, .next = fscache_histogram_next, .show = fscache_histogram_show, }; - -/* - * open "/proc/fs/fscache/histogram" to provide latency data - */ -static int fscache_histogram_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &fscache_histogram_ops); -} - -const struct file_operations fscache_histogram_fops = { - .open = fscache_histogram_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 500650f938fe..f83328a7f048 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -31,6 +31,7 @@ #include <linux/fscache-cache.h> #include <trace/events/fscache.h> #include <linux/sched.h> +#include <linux/seq_file.h> #define FSCACHE_MIN_THREADS 4 #define FSCACHE_MAX_THREADS 32 @@ -84,7 +85,7 @@ static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif) atomic_inc(&histogram[jif]); } -extern const struct file_operations fscache_histogram_fops; +extern const struct seq_operations fscache_histogram_ops; #else #define fscache_hist(hist, start_jif) do {} while (0) @@ -294,7 +295,7 @@ static inline void fscache_stat_d(atomic_t *stat) #define __fscache_stat(stat) (stat) -extern const struct file_operations fscache_stats_fops; +int fscache_stats_show(struct seq_file *m, void *v); #else #define __fscache_stat(stat) (NULL) diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c index 1d9e4951a597..49a8c90414bc 100644 --- a/fs/fscache/proc.c +++ b/fs/fscache/proc.c @@ -26,14 +26,14 @@ int __init fscache_proc_init(void) goto error_dir; #ifdef CONFIG_FSCACHE_STATS - if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL, - &fscache_stats_fops)) + if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL, + fscache_stats_show)) goto error_stats; #endif #ifdef CONFIG_FSCACHE_HISTOGRAM - if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL, - &fscache_histogram_fops)) + if (!proc_create_seq("fs/fscache/histogram", S_IFREG | 0444, NULL, + &fscache_histogram_ops)) goto error_histogram; #endif diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index fcc8c2f2690e..00564a1dfd76 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -138,7 +138,7 @@ atomic_t fscache_n_cache_culled_objects; /* * display the general statistics */ -static int fscache_stats_show(struct seq_file *m, void *v) +int fscache_stats_show(struct seq_file *m, void *v) { seq_puts(m, "FS-Cache statistics\n"); @@ -284,18 +284,3 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_cache_culled_objects)); return 0; } - -/* - * open "/proc/fs/fscache/stats" allowing provision of a statistical summary - */ -static int fscache_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, fscache_stats_show, NULL); -} - -const struct file_operations fscache_stats_fops = { - .open = fscache_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 513c357c734b..a6c0f54c48c3 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -588,6 +588,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) return 0; out_put_hidden_dir: + cancel_delayed_work_sync(&sbi->sync_work); iput(sbi->hidden_dir); out_put_root: dput(sb->s_root); diff --git a/fs/inode.c b/fs/inode.c index 13ceb98c3bd3..3b55391072f3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -178,6 +178,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; + mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; diff --git a/fs/internal.h b/fs/internal.h index e08972db0303..980d005b21b4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -125,6 +125,7 @@ int do_fchmodat(int dfd, const char __user *filename, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); +extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file *filp_clone_open(struct file *); diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 9bb2fe35799d..10205ececc27 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -20,6 +20,7 @@ #include <linux/init.h> #include <linux/bio.h> +#include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/zlib.h> @@ -59,7 +60,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, >> bufshift; int haveblocks; blkcnt_t blocknum; - struct buffer_head *bhs[needblocks + 1]; + struct buffer_head **bhs; int curbh, curpage; if (block_size > deflateBound(1UL << zisofs_block_shift)) { @@ -80,7 +81,11 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, /* Because zlib is not thread-safe, do all the I/O at the top. */ blocknum = block_start >> bufshift; - memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *)); + bhs = kcalloc(needblocks + 1, sizeof(*bhs), GFP_KERNEL); + if (!bhs) { + *errp = -ENOMEM; + return 0; + } haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); ll_rw_block(REQ_OP_READ, 0, haveblocks, bhs); @@ -190,6 +195,7 @@ z_eio: b_eio: for (i = 0; i < haveblocks; i++) brelse(bhs[i]); + kfree(bhs); return stream.total_out; } @@ -305,7 +311,7 @@ static int zisofs_readpage(struct file *file, struct page *page) unsigned int zisofs_pages_per_cblock = PAGE_SHIFT <= zisofs_block_shift ? (1 << (zisofs_block_shift - PAGE_SHIFT)) : 0; - struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)]; + struct page **pages; pgoff_t index = page->index, end_index; end_index = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -330,6 +336,12 @@ static int zisofs_readpage(struct file *file, struct page *page) full_page = 0; pcount = 1; } + pages = kcalloc(max_t(unsigned int, zisofs_pages_per_cblock, 1), + sizeof(*pages), GFP_KERNEL); + if (!pages) { + unlock_page(page); + return -ENOMEM; + } pages[full_page] = page; for (i = 0; i < pcount; i++, index++) { @@ -357,6 +369,7 @@ static int zisofs_readpage(struct file *file, struct page *page) } /* At this point, err contains 0 or -EIO depending on the "critical" page */ + kfree(pages); return err; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index bc258a4402f6..ec3fba7d492f 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -394,7 +394,10 @@ static int parse_options(char *options, struct iso9660_options *popt) break; #ifdef CONFIG_JOLIET case Opt_iocharset: + kfree(popt->iocharset); popt->iocharset = match_strdup(&args[0]); + if (!popt->iocharset) + return 0; break; #endif case Opt_map_a: diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ac311037d7a5..8aa453784402 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -532,6 +532,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, */ ret = start_this_handle(journal, handle, GFP_NOFS); if (ret < 0) { + handle->h_journal = journal; jbd2_journal_free_reserved(handle); return ret; } diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 0a754f38462e..e5a6deb38e1e 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -209,8 +209,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, __func__, inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->pino_nlink, inode->i_mapping->nrpages); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: @@ -430,8 +429,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: @@ -575,8 +573,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: @@ -747,8 +744,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; fail: diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index f60dee7faf03..87bdf0f4cba1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -342,7 +342,7 @@ static void jffs2_put_super (struct super_block *sb) static void jffs2_kill_sb(struct super_block *sb) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); - if (!sb_rdonly(sb)) + if (c && !sb_rdonly(sb)) jffs2_stop_garbage_collect_thread(c); kill_mtd_super(sb); kfree(c); diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index a70907606025..35a5b2a81ae0 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -29,7 +29,6 @@ #ifdef PROC_FS_JFS /* see jfs_debug.h */ -static struct proc_dir_entry *base; #ifdef CONFIG_JFS_DEBUG static int jfs_loglevel_proc_show(struct seq_file *m, void *v) { @@ -66,43 +65,29 @@ static const struct file_operations jfs_loglevel_proc_fops = { }; #endif -static struct { - const char *name; - const struct file_operations *proc_fops; -} Entries[] = { -#ifdef CONFIG_JFS_STATISTICS - { "lmstats", &jfs_lmstats_proc_fops, }, - { "txstats", &jfs_txstats_proc_fops, }, - { "xtstat", &jfs_xtstat_proc_fops, }, - { "mpstat", &jfs_mpstat_proc_fops, }, -#endif -#ifdef CONFIG_JFS_DEBUG - { "TxAnchor", &jfs_txanchor_proc_fops, }, - { "loglevel", &jfs_loglevel_proc_fops } -#endif -}; -#define NPROCENT ARRAY_SIZE(Entries) - void jfs_proc_init(void) { - int i; + struct proc_dir_entry *base; - if (!(base = proc_mkdir("fs/jfs", NULL))) + base = proc_mkdir("fs/jfs", NULL); + if (!base) return; - for (i = 0; i < NPROCENT; i++) - proc_create(Entries[i].name, 0, base, Entries[i].proc_fops); +#ifdef CONFIG_JFS_STATISTICS + proc_create_single("lmstats", 0, base, jfs_lmstats_proc_show); + proc_create_single("txstats", 0, base, jfs_txstats_proc_show); + proc_create_single("xtstat", 0, base, jfs_xtstat_proc_show); + proc_create_single("mpstat", 0, base, jfs_mpstat_proc_show); +#endif +#ifdef CONFIG_JFS_DEBUG + proc_create_single("TxAnchor", 0, base, jfs_txanchor_proc_show); + proc_create("loglevel", 0, base, &jfs_loglevel_proc_fops); +#endif } void jfs_proc_clean(void) { - int i; - - if (base) { - for (i = 0; i < NPROCENT; i++) - remove_proc_entry(Entries[i].name, base); - remove_proc_entry("fs/jfs", NULL); - } + remove_proc_subtree("fs/jfs", NULL); } #endif /* PROC_FS_JFS */ diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h index eafd1300a00b..0d9e35da8462 100644 --- a/fs/jfs/jfs_debug.h +++ b/fs/jfs/jfs_debug.h @@ -62,7 +62,7 @@ extern void jfs_proc_clean(void); extern int jfsloglevel; -extern const struct file_operations jfs_txanchor_proc_fops; +int jfs_txanchor_proc_show(struct seq_file *m, void *v); /* information message: e.g., configuration, major event */ #define jfs_info(fmt, arg...) do { \ @@ -105,10 +105,10 @@ extern const struct file_operations jfs_txanchor_proc_fops; * ---------- */ #ifdef CONFIG_JFS_STATISTICS -extern const struct file_operations jfs_lmstats_proc_fops; -extern const struct file_operations jfs_txstats_proc_fops; -extern const struct file_operations jfs_mpstat_proc_fops; -extern const struct file_operations jfs_xtstat_proc_fops; +int jfs_lmstats_proc_show(struct seq_file *m, void *v); +int jfs_txstats_proc_show(struct seq_file *m, void *v); +int jfs_mpstat_proc_show(struct seq_file *m, void *v); +int jfs_xtstat_proc_show(struct seq_file *m, void *v); #define INCREMENT(x) ((x)++) #define DECREMENT(x) ((x)--) diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 0e5d412c0b01..6b68df395892 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2493,7 +2493,7 @@ exit: } #ifdef CONFIG_JFS_STATISTICS -static int jfs_lmstats_proc_show(struct seq_file *m, void *v) +int jfs_lmstats_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS Logmgr stats\n" @@ -2510,16 +2510,4 @@ static int jfs_lmstats_proc_show(struct seq_file *m, void *v) lmStat.partial_page); return 0; } - -static int jfs_lmstats_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_lmstats_proc_show, NULL); -} - -const struct file_operations jfs_lmstats_proc_fops = { - .open = jfs_lmstats_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif /* CONFIG_JFS_STATISTICS */ diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 1a3b0cc22ad3..fa2c6824c7f2 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -815,7 +815,7 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len) } #ifdef CONFIG_JFS_STATISTICS -static int jfs_mpstat_proc_show(struct seq_file *m, void *v) +int jfs_mpstat_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS Metapage statistics\n" @@ -828,16 +828,4 @@ static int jfs_mpstat_proc_show(struct seq_file *m, void *v) mpStat.lockwait); return 0; } - -static int jfs_mpstat_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_mpstat_proc_show, NULL); -} - -const struct file_operations jfs_mpstat_proc_fops = { - .open = jfs_mpstat_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 4d973524c887..a5663cb621d8 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2998,7 +2998,7 @@ int jfs_sync(void *arg) } #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) -static int jfs_txanchor_proc_show(struct seq_file *m, void *v) +int jfs_txanchor_proc_show(struct seq_file *m, void *v) { char *freewait; char *freelockwait; @@ -3032,22 +3032,10 @@ static int jfs_txanchor_proc_show(struct seq_file *m, void *v) list_empty(&TxAnchor.unlock_queue) ? "" : "not "); return 0; } - -static int jfs_txanchor_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_txanchor_proc_show, NULL); -} - -const struct file_operations jfs_txanchor_proc_fops = { - .open = jfs_txanchor_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) -static int jfs_txstats_proc_show(struct seq_file *m, void *v) +int jfs_txstats_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS TxStats\n" @@ -3072,16 +3060,4 @@ static int jfs_txstats_proc_show(struct seq_file *m, void *v) TxStat.txLockAlloc_freelock); return 0; } - -static int jfs_txstats_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_txstats_proc_show, NULL); -} - -const struct file_operations jfs_txstats_proc_fops = { - .open = jfs_txstats_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 5cde6d2fcfca..2c200b5256a6 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -3874,7 +3874,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) } #ifdef CONFIG_JFS_STATISTICS -static int jfs_xtstat_proc_show(struct seq_file *m, void *v) +int jfs_xtstat_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS Xtree statistics\n" @@ -3887,16 +3887,4 @@ static int jfs_xtstat_proc_show(struct seq_file *m, void *v) xtStat.split); return 0; } - -static int jfs_xtstat_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, jfs_xtstat_proc_show, NULL); -} - -const struct file_operations jfs_xtstat_proc_fops = { - .open = jfs_xtstat_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index b41596d71858..56c3fcbfe80e 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -178,8 +178,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out2: @@ -313,8 +312,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out2: @@ -1059,8 +1057,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out2: @@ -1447,8 +1444,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, unlock_new_inode(ip); iput(ip); } else { - unlock_new_inode(ip); - d_instantiate(dentry, ip); + d_instantiate_new(dentry, ip); } out1: diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 26dd9a50f383..ff2716f9322e 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -316,6 +316,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, info->root = root; info->ns = ns; + INIT_LIST_HEAD(&info->node); sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags, &init_user_ns, info); diff --git a/fs/locks.c b/fs/locks.c index 62bbe8b31f26..05e211be8684 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2788,22 +2788,10 @@ static const struct seq_operations locks_seq_operations = { .show = locks_show, }; -static int locks_open(struct inode *inode, struct file *filp) -{ - return seq_open_private(filp, &locks_seq_operations, - sizeof(struct locks_iterator)); -} - -static const struct file_operations proc_locks_operations = { - .open = locks_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - static int __init proc_locks_init(void) { - proc_create("locks", 0, NULL, &proc_locks_operations); + proc_create_seq_private("locks", 0, NULL, &locks_seq_operations, + sizeof(struct locks_iterator), NULL); return 0; } fs_initcall(proc_locks_init); diff --git a/fs/namei.c b/fs/namei.c index 6f0dc40f88c5..a59968de1636 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3365,7 +3365,9 @@ finish_open_created: goto out; *opened |= FILE_OPENED; opened: - error = ima_file_check(file, op->acc_mode, *opened); + error = open_check_o_direct(file); + if (!error) + error = ima_file_check(file, op->acc_mode, *opened); if (!error && will_truncate) error = handle_truncate(file); out: @@ -3445,6 +3447,9 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, error = finish_open(file, child, NULL, opened); if (error) goto out2; + error = open_check_o_direct(file); + if (error) + fput(file); out2: mnt_drop_write(path.mnt); out: @@ -3845,11 +3850,11 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (error) goto out; - shrink_dcache_parent(dentry); error = dir->i_op->rmdir(dir, dentry); if (error) goto out; + shrink_dcache_parent(dentry); dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); @@ -4432,8 +4437,6 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir->i_nlink >= max_links) goto out; } - if (is_dir && !(flags & RENAME_EXCHANGE) && target) - shrink_dcache_parent(new_dentry); if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) @@ -4450,8 +4453,10 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; if (!(flags & RENAME_EXCHANGE) && target) { - if (is_dir) + if (is_dir) { + shrink_dcache_parent(new_dentry); target->i_flags |= S_DEAD; + } dont_mount(new_dentry); detach_mounts(new_dentry); } diff --git a/fs/namespace.c b/fs/namespace.c index e398f32d7541..5f75969adff1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1089,7 +1089,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, goto out_free; } - mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); + mnt->mnt.mnt_flags = old->mnt.mnt_flags; + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); /* Don't allow unprivileged users to change mount flags */ if (flag & CL_UNPRIVILEGED) { mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; @@ -2814,7 +2815,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); - if (flags & SB_RDONLY) + if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; /* The default atime for remount is preservation */ diff --git a/fs/nfs/client.c b/fs/nfs/client.c index b9129e2befea..bbc91d7ca1bd 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1067,7 +1067,6 @@ void nfs_clients_init(struct net *net) } #ifdef CONFIG_PROC_FS -static int nfs_server_list_open(struct inode *inode, struct file *file); static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_server_list_stop(struct seq_file *p, void *v); @@ -1080,14 +1079,6 @@ static const struct seq_operations nfs_server_list_ops = { .show = nfs_server_list_show, }; -static const struct file_operations nfs_server_list_fops = { - .open = nfs_server_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -static int nfs_volume_list_open(struct inode *inode, struct file *file); static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_volume_list_stop(struct seq_file *p, void *v); @@ -1100,23 +1091,6 @@ static const struct seq_operations nfs_volume_list_ops = { .show = nfs_volume_list_show, }; -static const struct file_operations nfs_volume_list_fops = { - .open = nfs_volume_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -/* - * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which - * we're dealing - */ -static int nfs_server_list_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &nfs_server_list_ops, - sizeof(struct seq_net_private)); -} - /* * set up the iterator to start reading from the server list and return the first item */ @@ -1185,15 +1159,6 @@ static int nfs_server_list_show(struct seq_file *m, void *v) } /* - * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes - */ -static int nfs_volume_list_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &nfs_volume_list_ops, - sizeof(struct seq_net_private)); -} - -/* * set up the iterator to start reading from the volume list and return the first item */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) @@ -1278,14 +1243,14 @@ int nfs_fs_proc_net_init(struct net *net) goto error_0; /* a file of servers with which we're dealing */ - p = proc_create("servers", S_IFREG|S_IRUGO, - nn->proc_nfsfs, &nfs_server_list_fops); + p = proc_create_net("servers", S_IFREG|S_IRUGO, nn->proc_nfsfs, + &nfs_server_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; /* a file of volumes that we have mounted */ - p = proc_create("volumes", S_IFREG|S_IRUGO, - nn->proc_nfsfs, &nfs_volume_list_fops); + p = proc_create_net("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs, + &nfs_volume_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; return 0; diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 70b8bf781fce..a43dfedd69ec 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -227,7 +227,7 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, if (!buf) return -ENOMEM; - rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL); + rq = blk_get_request(q, REQ_OP_SCSI_IN, 0); if (IS_ERR(rq)) { error = -ENOMEM; goto out_free_buf; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2410b093a2e6..b0555d7d8200 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1201,6 +1201,28 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, break; case S_IFDIR: host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); + if (!host_err && unlikely(d_unhashed(dchild))) { + struct dentry *d; + d = lookup_one_len(dchild->d_name.name, + dchild->d_parent, + dchild->d_name.len); + if (IS_ERR(d)) { + host_err = PTR_ERR(d); + break; + } + if (unlikely(d_is_negative(d))) { + dput(d); + err = nfserr_serverfault; + goto out; + } + dput(resfhp->fh_dentry); + resfhp->fh_dentry = dget(d); + err = fh_update(resfhp); + dput(dchild); + dchild = d; + if (err) + goto out; + } break; case S_IFCHR: case S_IFBLK: diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 1a2894aa0194..dd52d3f82e8d 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -46,8 +46,7 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) int err = nilfs_add_link(dentry, inode); if (!err) { - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); return 0; } inode_dec_link_count(inode); @@ -243,8 +242,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; nilfs_mark_inode_dirty(inode); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); out: if (!err) err = nilfs_transaction_commit(dir->i_sb); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index d51e1bb781cf..d94e8031fe5f 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -92,7 +92,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, u32 event_mask, const void *data, int data_type) { - __u32 marks_mask, marks_ignored_mask; + __u32 marks_mask = 0, marks_ignored_mask = 0; const struct path *path = data; pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" @@ -108,24 +108,20 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, !d_can_lookup(path->dentry)) return false; - if (inode_mark && vfsmnt_mark) { - marks_mask = (vfsmnt_mark->mask | inode_mark->mask); - marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask); - } else if (inode_mark) { - /* - * if the event is for a child and this inode doesn't care about - * events on the child, don't send it! - */ - if ((event_mask & FS_EVENT_ON_CHILD) && - !(inode_mark->mask & FS_EVENT_ON_CHILD)) - return false; - marks_mask = inode_mark->mask; - marks_ignored_mask = inode_mark->ignored_mask; - } else if (vfsmnt_mark) { - marks_mask = vfsmnt_mark->mask; - marks_ignored_mask = vfsmnt_mark->ignored_mask; - } else { - BUG(); + /* + * if the event is for a child and this inode doesn't care about + * events on the child, don't send it! + */ + if (inode_mark && + (!(event_mask & FS_EVENT_ON_CHILD) || + (inode_mark->mask & FS_EVENT_ON_CHILD))) { + marks_mask |= inode_mark->mask; + marks_ignored_mask |= inode_mark->ignored_mask; + } + + if (vfsmnt_mark) { + marks_mask |= vfsmnt_mark->mask; + marks_ignored_mask |= vfsmnt_mark->ignored_mask; } if (d_is_dir(path->dentry) && diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 219b269c737e..613ec7e5a465 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -192,8 +192,9 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; - __u32 inode_test_mask = 0; - __u32 vfsmount_test_mask = 0; + __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); + __u32 marks_mask = 0; + __u32 marks_ignored_mask = 0; if (unlikely(!inode_mark && !vfsmount_mark)) { BUG(); @@ -213,29 +214,25 @@ static int send_to_group(struct inode *to_tell, /* does the inode mark tell us to do something? */ if (inode_mark) { group = inode_mark->group; - inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); - inode_test_mask &= inode_mark->mask; - inode_test_mask &= ~inode_mark->ignored_mask; + marks_mask |= inode_mark->mask; + marks_ignored_mask |= inode_mark->ignored_mask; } /* does the vfsmount_mark tell us to do something? */ if (vfsmount_mark) { - vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); group = vfsmount_mark->group; - vfsmount_test_mask &= vfsmount_mark->mask; - vfsmount_test_mask &= ~vfsmount_mark->ignored_mask; - if (inode_mark) - vfsmount_test_mask &= ~inode_mark->ignored_mask; + marks_mask |= vfsmount_mark->mask; + marks_ignored_mask |= vfsmount_mark->ignored_mask; } pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" - " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" + " vfsmount_mark=%p marks_mask=%x marks_ignored_mask=%x" " data=%p data_is=%d cookie=%d\n", - __func__, group, to_tell, mask, inode_mark, - inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, + __func__, group, to_tell, mask, inode_mark, vfsmount_mark, + marks_mask, marks_ignored_mask, data, data_is, cookie); - if (!inode_test_mask && !vfsmount_test_mask) + if (!(test_mask & marks_mask & ~marks_ignored_mask)) return 0; return group->ops->handle_event(group, to_tell, inode_mark, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 91a8889abf9b..ea8c551bcd7e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -570,16 +570,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, current_page, vec_len, vec_start); len = bio_add_page(bio, page, vec_len, vec_start); - if (len != vec_len) { - mlog(ML_ERROR, "Adding page[%d] to bio failed, " - "page %p, len %d, vec_len %u, vec_start %u, " - "bi_sector %llu\n", current_page, page, len, - vec_len, vec_start, - (unsigned long long)bio->bi_iter.bi_sector); - bio_put(bio); - bio = ERR_PTR(-EIO); - return bio; - } + if (len != vec_len) break; cs += vec_len / (PAGE_SIZE/spp); vec_start = 0; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 01c6b3894406..7869622af22a 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4250,10 +4250,11 @@ out: static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, bool preserve) { - int error; + int error, had_lock; struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; + struct ocfs2_lock_holder oh; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; @@ -4295,6 +4296,14 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, goto out; } + had_lock = ocfs2_inode_lock_tracker(new_orphan_inode, NULL, 1, + &oh); + if (had_lock < 0) { + error = had_lock; + mlog_errno(error); + goto out; + } + /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, @@ -4302,14 +4311,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, if (error) mlog_errno(error); } -out: if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); if (error) mlog_errno(error); } + ocfs2_inode_unlock_tracker(new_orphan_inode, 1, &oh, had_lock); +out: if (new_orphan_inode) { /* * We need to open_unlock the inode no matter whether we diff --git a/fs/open.c b/fs/open.c index c5ee7cd60424..d0e955b558ad 100644 --- a/fs/open.c +++ b/fs/open.c @@ -724,6 +724,16 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) return ksys_fchown(fd, user, group); } +int open_check_o_direct(struct file *f) +{ + /* NB: we're sure to have correct a_ops only after f_op->open */ + if (f->f_flags & O_DIRECT) { + if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) + return -EINVAL; + } + return 0; +} + static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *), @@ -745,7 +755,7 @@ static int do_dentry_open(struct file *f, if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH; f->f_op = &empty_fops; - goto done; + return 0; } if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { @@ -798,12 +808,7 @@ static int do_dentry_open(struct file *f, f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); -done: - /* NB: we're sure to have correct a_ops only after f_op->open */ - error = -EINVAL; - if ((f->f_flags & O_DIRECT) && - (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)) - goto out_fput; + return 0; cleanup_all: @@ -818,9 +823,6 @@ cleanup_file: f->f_path.dentry = NULL; f->f_inode = NULL; return error; -out_fput: - fput(f); - return error; } /** @@ -918,14 +920,20 @@ struct file *dentry_open(const struct path *path, int flags, BUG_ON(!path->mnt); f = get_empty_filp(); - if (IS_ERR(f)) - return f; - - f->f_flags = flags; - error = vfs_open(path, f, cred); - if (error) { - put_filp(f); - return ERR_PTR(error); + if (!IS_ERR(f)) { + f->f_flags = flags; + error = vfs_open(path, f, cred); + if (!error) { + /* from now on we need fput() to dispose of f */ + error = open_check_o_direct(f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } else { + put_filp(f); + f = ERR_PTR(error); + } } return f; } diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 6e3134e6d98a..1b5707c44c3f 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -75,8 +75,7 @@ static int orangefs_create(struct inode *dir, get_khandle_from_ino(inode), dentry); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; @@ -332,8 +331,7 @@ static int orangefs_symlink(struct inode *dir, "Assigned symlink inode new number of %pU\n", get_khandle_from_ino(inode)); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; @@ -402,8 +400,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode "Assigned dir inode new number of %pU\n", get_khandle_from_ino(inode)); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 3ae5fdba0225..10796d3fe27d 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -579,6 +579,11 @@ void orangefs_kill_sb(struct super_block *sb) /* provided sb cleanup */ kill_anon_super(sb); + if (!ORANGEFS_SB(sb)) { + mutex_lock(&orangefs_request_mutex); + mutex_unlock(&orangefs_request_mutex); + return; + } /* * issue the unmount to userspace to tell it to remove the * dynamic mount info it has for this superblock diff --git a/fs/proc/array.c b/fs/proc/array.c index ae2c807fd719..e6d7f41b6684 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -85,6 +85,7 @@ #include <linux/delayacct.h> #include <linux/seq_file.h> #include <linux/pid_namespace.h> +#include <linux/prctl.h> #include <linux/ptrace.h> #include <linux/tracehook.h> #include <linux/string_helpers.h> @@ -335,6 +336,30 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #endif + seq_printf(m, "\nSpeculation_Store_Bypass:\t"); + switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { + case -EINVAL: + seq_printf(m, "unknown"); + break; + case PR_SPEC_NOT_AFFECTED: + seq_printf(m, "not vulnerable"); + break; + case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: + seq_printf(m, "thread force mitigated"); + break; + case PR_SPEC_PRCTL | PR_SPEC_DISABLE: + seq_printf(m, "thread mitigated"); + break; + case PR_SPEC_PRCTL | PR_SPEC_ENABLE: + seq_printf(m, "thread vulnerable"); + break; + case PR_SPEC_DISABLE: + seq_printf(m, "globally mitigated"); + break; + default: + seq_printf(m, "vulnerable"); + break; + } seq_putc(m, '\n'); } @@ -677,25 +702,22 @@ out: static int children_seq_show(struct seq_file *seq, void *v) { - struct inode *inode = seq->private; - pid_t pid; - - pid = pid_nr_ns(v, inode->i_sb->s_fs_info); - seq_printf(seq, "%d ", pid); + struct inode *inode = file_inode(seq->file); + seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode))); return 0; } static void *children_seq_start(struct seq_file *seq, loff_t *pos) { - return get_children_pid(seq->private, NULL, *pos); + return get_children_pid(file_inode(seq->file), NULL, *pos); } static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct pid *pid; - pid = get_children_pid(seq->private, v, *pos + 1); + pid = get_children_pid(file_inode(seq->file), v, *pos + 1); put_pid(v); ++*pos; @@ -716,17 +738,7 @@ static const struct seq_operations children_seq_ops = { static int children_seq_open(struct inode *inode, struct file *file) { - struct seq_file *m; - int ret; - - ret = seq_open(file, &children_seq_ops); - if (ret) - return ret; - - m = file->private_data; - m->private = inode; - - return ret; + return seq_open(file, &children_seq_ops); } const struct file_operations proc_tid_children_operations = { diff --git a/fs/proc/base.c b/fs/proc/base.c index eafa39a3a88c..4e35593546b1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -261,7 +261,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, * Inherently racy -- command line shares address space * with code and data. */ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0); + rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON); if (rv <= 0) goto out_free_page; @@ -279,7 +279,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, int nr_read; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -325,7 +325,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -698,7 +698,7 @@ static bool has_pid_permissions(struct pid_namespace *pid, static int proc_pid_permission(struct inode *inode, int mask) { - struct pid_namespace *pid = inode->i_sb->s_fs_info; + struct pid_namespace *pid = proc_pid_ns(inode); struct task_struct *task; bool has_perms; @@ -733,13 +733,11 @@ static const struct inode_operations proc_def_inode_operations = { static int proc_single_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct pid_namespace *ns; - struct pid *pid; + struct pid_namespace *ns = proc_pid_ns(inode); + struct pid *pid = proc_pid(inode); struct task_struct *task; int ret; - ns = inode->i_sb->s_fs_info; - pid = proc_pid(inode); task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; @@ -946,7 +944,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); - retval = access_remote_vm(mm, (env_start + src), page, this_len, 0); + retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON); if (retval <= 0) { ret = retval; @@ -1410,7 +1408,7 @@ static const struct file_operations proc_fail_nth_operations = { static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct pid_namespace *ns = inode->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(inode); struct task_struct *p; p = get_proc_task(inode); @@ -1693,6 +1691,12 @@ void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t uid; kgid_t gid; + if (unlikely(task->flags & PF_KTHREAD)) { + *ruid = GLOBAL_ROOT_UID; + *rgid = GLOBAL_ROOT_GID; + return; + } + /* Default to the tasks effective ownership */ rcu_read_lock(); cred = __task_cred(task); @@ -1776,8 +1780,8 @@ int pid_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); + struct pid_namespace *pid = proc_pid_ns(inode); struct task_struct *task; - struct pid_namespace *pid = path->dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); @@ -2331,7 +2335,7 @@ static int proc_timers_open(struct inode *inode, struct file *file) return -ENOMEM; tp->pid = proc_pid(inode); - tp->ns = inode->i_sb->s_fs_info; + tp->ns = proc_pid_ns(inode); return 0; } @@ -3233,7 +3237,7 @@ retry: int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; - struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(file_inode(file)); loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) @@ -3582,7 +3586,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - ns = inode->i_sb->s_fs_info; + ns = proc_pid_ns(inode); tid = (int)file->f_version; file->f_version = 0; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 8233e7af9389..fa762c5fbcb2 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -11,21 +11,9 @@ static int cmdline_proc_show(struct seq_file *m, void *v) return 0; } -static int cmdline_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, cmdline_proc_show, NULL); -} - -static const struct file_operations cmdline_proc_fops = { - .open = cmdline_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_cmdline_init(void) { - proc_create("cmdline", 0, NULL, &cmdline_proc_fops); + proc_create_single("cmdline", 0, NULL, cmdline_proc_show); return 0; } fs_initcall(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index a8ac48aebd59..954caf0b7fee 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -91,21 +91,9 @@ static const struct seq_operations consoles_op = { .show = show_console_dev }; -static int consoles_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &consoles_op); -} - -static const struct file_operations proc_consoles_operations = { - .open = consoles_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_consoles_init(void) { - proc_create("consoles", 0, NULL, &proc_consoles_operations); + proc_create_seq("consoles", 0, NULL, &consoles_op); return 0; } fs_initcall(proc_consoles_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 2c7f22b14489..37d38697eaf8 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -51,21 +51,9 @@ static const struct seq_operations devinfo_ops = { .show = devinfo_show }; -static int devinfo_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &devinfo_ops); -} - -static const struct file_operations proc_devinfo_operations = { - .open = devinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_devices_init(void) { - proc_create("devices", 0, NULL, &proc_devinfo_operations); + proc_create_seq("devices", 0, NULL, &devinfo_ops); return 0; } fs_initcall(proc_devices_init); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 2078e70e1595..02bb1914f5f7 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -25,6 +25,7 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/uaccess.h> +#include <linux/seq_file.h> #include "internal.h" @@ -346,13 +347,12 @@ static const struct inode_operations proc_dir_inode_operations = { .setattr = proc_notify_change, }; -static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) +/* returns the registered entry, or frees dp and returns NULL on failure */ +struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, + struct proc_dir_entry *dp) { - int ret; - - ret = proc_alloc_inum(&dp->low_ino); - if (ret) - return ret; + if (proc_alloc_inum(&dp->low_ino)) + goto out_free_entry; write_lock(&proc_subdir_lock); dp->parent = dir; @@ -360,12 +360,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); write_unlock(&proc_subdir_lock); - proc_free_inum(dp->low_ino); - return -EEXIST; + goto out_free_inum; } write_unlock(&proc_subdir_lock); - return 0; + return dp; +out_free_inum: + proc_free_inum(dp->low_ino); +out_free_entry: + pde_free(dp); + return NULL; } static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, @@ -443,10 +447,7 @@ struct proc_dir_entry *proc_symlink(const char *name, if (ent->data) { strcpy((char*)ent->data,dest); ent->proc_iops = &proc_link_inode_operations; - if (proc_register(parent, ent) < 0) { - pde_free(ent); - ent = NULL; - } + ent = proc_register(parent, ent); } else { pde_free(ent); ent = NULL; @@ -470,11 +471,9 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent->proc_fops = &proc_dir_operations; ent->proc_iops = &proc_dir_inode_operations; parent->nlink++; - if (proc_register(parent, ent) < 0) { - pde_free(ent); + ent = proc_register(parent, ent); + if (!ent) parent->nlink--; - ent = NULL; - } } return ent; } @@ -505,47 +504,47 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->proc_fops = NULL; ent->proc_iops = NULL; parent->nlink++; - if (proc_register(parent, ent) < 0) { - pde_free(ent); + ent = proc_register(parent, ent); + if (!ent) parent->nlink--; - ent = NULL; - } } return ent; } EXPORT_SYMBOL(proc_create_mount_point); -struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, - struct proc_dir_entry *parent, - const struct file_operations *proc_fops, - void *data) +struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, + struct proc_dir_entry **parent, void *data) { - struct proc_dir_entry *pde; + struct proc_dir_entry *p; + if ((mode & S_IFMT) == 0) mode |= S_IFREG; - - if (!S_ISREG(mode)) { - WARN_ON(1); /* use proc_mkdir() */ + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + if (WARN_ON_ONCE(!S_ISREG(mode))) return NULL; + + p = __proc_create(parent, name, mode, 1); + if (p) { + p->proc_iops = &proc_file_inode_operations; + p->data = data; } + return p; +} + +struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct file_operations *proc_fops, void *data) +{ + struct proc_dir_entry *p; BUG_ON(proc_fops == NULL); - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO; - pde = __proc_create(&parent, name, mode, 1); - if (!pde) - goto out; - pde->proc_fops = proc_fops; - pde->data = data; - pde->proc_iops = &proc_file_inode_operations; - if (proc_register(parent, pde) < 0) - goto out_free; - return pde; -out_free: - pde_free(pde); -out: - return NULL; + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = proc_fops; + return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); @@ -557,6 +556,67 @@ struct proc_dir_entry *proc_create(const char *name, umode_t mode, } EXPORT_SYMBOL(proc_create); +static int proc_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + + if (de->state_size) + return seq_open_private(file, de->seq_ops, de->state_size); + return seq_open(file, de->seq_ops); +} + +static const struct file_operations proc_seq_fops = { + .open = proc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, + struct proc_dir_entry *parent, const struct seq_operations *ops, + unsigned int state_size, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_seq_fops; + p->seq_ops = ops; + p->state_size = state_size; + return proc_register(parent, p); +} +EXPORT_SYMBOL(proc_create_seq_private); + +static int proc_single_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *de = PDE(inode); + + return single_open(file, de->single_show, de->data); +} + +static const struct file_operations proc_single_fops = { + .open = proc_single_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, + int (*show)(struct seq_file *, void *), void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_single_fops; + p->single_show = show; + return proc_register(parent, p); +} +EXPORT_SYMBOL(proc_create_single_data); + void proc_set_size(struct proc_dir_entry *de, loff_t size) { de->size = size; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0f1692e63cb6..a318ae5b36b4 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -44,7 +44,12 @@ struct proc_dir_entry { struct completion *pde_unload_completion; const struct inode_operations *proc_iops; const struct file_operations *proc_fops; + union { + const struct seq_operations *seq_ops; + int (*single_show)(struct seq_file *, void *); + }; void *data; + unsigned int state_size; unsigned int low_ino; nlink_t nlink; kuid_t uid; @@ -57,9 +62,9 @@ struct proc_dir_entry { umode_t mode; u8 namelen; #ifdef CONFIG_64BIT -#define SIZEOF_PDE_INLINE_NAME (192-139) +#define SIZEOF_PDE_INLINE_NAME (192-155) #else -#define SIZEOF_PDE_INLINE_NAME (128-87) +#define SIZEOF_PDE_INLINE_NAME (128-95) #endif char inline_name[SIZEOF_PDE_INLINE_NAME]; } __randomize_layout; @@ -162,6 +167,10 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i /* * generic.c */ +struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, + struct proc_dir_entry **parent, void *data); +struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, + struct proc_dir_entry *dp); extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file *, struct dir_context *); diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c index 6a6bee9c603c..cb0edc7cbf09 100644 --- a/fs/proc/interrupts.c +++ b/fs/proc/interrupts.c @@ -34,21 +34,9 @@ static const struct seq_operations int_seq_ops = { .show = show_interrupts }; -static int interrupts_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &int_seq_ops); -} - -static const struct file_operations proc_interrupts_operations = { - .open = interrupts_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_interrupts_init(void) { - proc_create("interrupts", 0, NULL, &proc_interrupts_operations); + proc_create_seq("interrupts", 0, NULL, &int_seq_ops); return 0; } fs_initcall(proc_interrupts_init); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index d1e82761de81..e64ecb9f2720 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -209,25 +209,34 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) { struct list_head *head = (struct list_head *)arg; struct kcore_list *ent; + struct page *p; + + if (!pfn_valid(pfn)) + return 1; + + p = pfn_to_page(pfn); + if (!memmap_valid_within(pfn, p, page_zone(p))) + return 1; ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) return -ENOMEM; - ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT)); + ent->addr = (unsigned long)page_to_virt(p); ent->size = nr_pages << PAGE_SHIFT; - /* Sanity check: Can happen in 32bit arch...maybe */ - if (ent->addr < (unsigned long) __va(0)) + if (!virt_addr_valid(ent->addr)) goto free_out; /* cut not-mapped area. ....from ppc-32 code. */ if (ULONG_MAX - ent->addr < ent->size) ent->size = ULONG_MAX - ent->addr; - /* cut when vmalloc() area is higher than direct-map area */ - if (VMALLOC_START > (unsigned long)__va(0)) { - if (ent->addr > VMALLOC_START) - goto free_out; + /* + * We've already checked virt_addr_valid so we know this address + * is a valid pointer, therefore we can check against it to determine + * if we need to trim + */ + if (VMALLOC_START > ent->addr) { if (VMALLOC_START - ent->addr < ent->size) ent->size = VMALLOC_START - ent->addr; } diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index a000d7547479..d06694757201 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -24,25 +24,13 @@ static int loadavg_proc_show(struct seq_file *m, void *v) LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, - idr_get_cursor(&task_active_pid_ns(current)->idr)); + idr_get_cursor(&task_active_pid_ns(current)->idr) - 1); return 0; } -static int loadavg_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, loadavg_proc_show, NULL); -} - -static const struct file_operations loadavg_proc_fops = { - .open = loadavg_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_loadavg_init(void) { - proc_create("loadavg", 0, NULL, &loadavg_proc_fops); + proc_create_single("loadavg", 0, NULL, loadavg_proc_show); return 0; } fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 65a72ab57471..2fb04846ed11 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -149,21 +149,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) return 0; } -static int meminfo_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, meminfo_proc_show, NULL); -} - -static const struct file_operations meminfo_proc_fops = { - .open = meminfo_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_meminfo_init(void) { - proc_create("meminfo", 0, NULL, &meminfo_proc_fops); + proc_create_single("meminfo", 0, NULL, meminfo_proc_show); return 0; } fs_initcall(proc_meminfo_init); diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 75634379f82e..3b63be64e436 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -113,21 +113,9 @@ static const struct seq_operations proc_nommu_region_list_seqop = { .show = nommu_region_list_show }; -static int proc_nommu_region_list_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &proc_nommu_region_list_seqop); -} - -static const struct file_operations proc_nommu_region_list_operations = { - .open = proc_nommu_region_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_nommu_init(void) { - proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); + proc_create_seq("maps", S_IRUGO, NULL, &proc_nommu_region_list_seqop); return 0; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 1763f370489d..7d94fa005b0d 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -38,20 +38,20 @@ static struct net *get_proc_net(const struct inode *inode) return maybe_get_net(PDE_NET(PDE(inode))); } -int seq_open_net(struct inode *ino, struct file *f, - const struct seq_operations *ops, int size) +static int seq_open_net(struct inode *inode, struct file *file) { - struct net *net; + unsigned int state_size = PDE(inode)->state_size; struct seq_net_private *p; + struct net *net; - BUG_ON(size < sizeof(*p)); + WARN_ON_ONCE(state_size < sizeof(*p)); - net = get_proc_net(ino); - if (net == NULL) + net = get_proc_net(inode); + if (!net) return -ENXIO; - p = __seq_open_private(f, ops, size); - if (p == NULL) { + p = __seq_open_private(file, PDE(inode)->seq_ops, state_size); + if (!p) { put_net(net); return -ENOMEM; } @@ -60,51 +60,83 @@ int seq_open_net(struct inode *ino, struct file *f, #endif return 0; } -EXPORT_SYMBOL_GPL(seq_open_net); -int single_open_net(struct inode *inode, struct file *file, - int (*show)(struct seq_file *, void *)) +static int seq_release_net(struct inode *ino, struct file *f) { - int err; - struct net *net; - - err = -ENXIO; - net = get_proc_net(inode); - if (net == NULL) - goto err_net; - - err = single_open(file, show, net); - if (err < 0) - goto err_open; + struct seq_file *seq = f->private_data; + put_net(seq_file_net(seq)); + seq_release_private(ino, f); return 0; +} -err_open: - put_net(net); -err_net: - return err; +static const struct file_operations proc_net_seq_fops = { + .open = seq_open_net, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, const struct seq_operations *ops, + unsigned int state_size, void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_net_seq_fops; + p->seq_ops = ops; + p->state_size = state_size; + return proc_register(parent, p); } -EXPORT_SYMBOL_GPL(single_open_net); +EXPORT_SYMBOL_GPL(proc_create_net_data); -int seq_release_net(struct inode *ino, struct file *f) +static int single_open_net(struct inode *inode, struct file *file) { - struct seq_file *seq; + struct proc_dir_entry *de = PDE(inode); + struct net *net; + int err; - seq = f->private_data; + net = get_proc_net(inode); + if (!net) + return -ENXIO; - put_net(seq_file_net(seq)); - seq_release_private(ino, f); - return 0; + err = single_open(file, de->single_show, net); + if (err) + put_net(net); + return err; } -EXPORT_SYMBOL_GPL(seq_release_net); -int single_release_net(struct inode *ino, struct file *f) +static int single_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; put_net(seq->private); return single_release(ino, f); } -EXPORT_SYMBOL_GPL(single_release_net); + +static const struct file_operations proc_net_single_fops = { + .open = single_open_net, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, + struct proc_dir_entry *parent, + int (*show)(struct seq_file *, void *), void *data) +{ + struct proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + p->proc_fops = &proc_net_single_fops; + p->single_show = show; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_single); static struct net *get_proc_task_net(struct inode *dir) { diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index d0cf1c50bb6c..c69ff191e5d8 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -126,18 +126,6 @@ static const struct seq_operations tty_drivers_op = { .show = show_tty_driver }; -static int tty_drivers_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &tty_drivers_op); -} - -static const struct file_operations proc_tty_drivers_operations = { - .open = tty_drivers_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - /* * This function is called by tty_register_driver() to handle * registering the driver's /proc handler into /proc/tty/driver/<foo> @@ -147,11 +135,11 @@ void proc_tty_register_driver(struct tty_driver *driver) struct proc_dir_entry *ent; if (!driver->driver_name || driver->proc_entry || - !driver->ops->proc_fops) + !driver->ops->proc_show) return; - ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, - driver->ops->proc_fops, driver); + ent = proc_create_single_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_show, driver); driver->proc_entry = ent; } @@ -186,6 +174,6 @@ void __init proc_tty_init(void) * entry. */ proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL); - proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops); - proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations); + proc_create_seq("tty/ldiscs", 0, NULL, &tty_ldiscs_seq_ops); + proc_create_seq("tty/drivers", 0, NULL, &tty_drivers_op); } diff --git a/fs/proc/self.c b/fs/proc/self.c index 4d7d061696b3..127265e5c55f 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -12,7 +12,7 @@ static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct pid_namespace *ns = inode->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(inode); pid_t tgid = task_tgid_nr_ns(current, ns); char *name; @@ -36,7 +36,7 @@ static unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct pid_namespace *ns = s->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *self; inode_lock(root_inode); diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 24072cc06e65..12901dcf57e2 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -25,21 +25,9 @@ static int show_softirqs(struct seq_file *p, void *v) return 0; } -static int softirqs_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_softirqs, NULL); -} - -static const struct file_operations proc_softirqs_operations = { - .open = softirqs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_softirqs_init(void) { - proc_create("softirqs", 0, NULL, &proc_softirqs_operations); + proc_create_single("softirqs", 0, NULL, show_softirqs); return 0; } fs_initcall(proc_softirqs_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 65ae54659833..c486ad4b43f0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1310,9 +1310,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION else if (is_swap_pmd(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); + unsigned long offset = swp_offset(entry); + offset += (addr & ~PMD_MASK) >> PAGE_SHIFT; frame = swp_type(entry) | - (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + (offset << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; @@ -1332,6 +1334,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, break; if (pm->show_pfn && (flags & PM_PRESENT)) frame++; + else if (flags & PM_SWAP) + frame += (1 << MAX_SWAPFILES_SHIFT); } spin_unlock(ptl); return err; diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 9d2efaca499f..b905010ca9eb 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -12,7 +12,7 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct pid_namespace *ns = inode->i_sb->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(inode); pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); char *name; @@ -36,7 +36,7 @@ static unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct pid_namespace *ns = s->s_fs_info; + struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *thread_self; inode_lock(root_inode); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 95a708d83721..3bd12f955867 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -30,21 +30,9 @@ static int uptime_proc_show(struct seq_file *m, void *v) return 0; } -static int uptime_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, uptime_proc_show, NULL); -} - -static const struct file_operations uptime_proc_fops = { - .open = uptime_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_uptime_init(void) { - proc_create("uptime", 0, NULL, &uptime_proc_fops); + proc_create_single("uptime", 0, NULL, uptime_proc_show); return 0; } fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index 94901e8e700d..b449f186577f 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -15,21 +15,9 @@ static int version_proc_show(struct seq_file *m, void *v) return 0; } -static int version_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, version_proc_show, NULL); -} - -static const struct file_operations version_proc_fops = { - .open = version_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_version_init(void) { - proc_create("version", 0, NULL, &version_proc_fops); + proc_create_single("version", 0, NULL, version_proc_show); return 0; } fs_initcall(proc_version_init); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 020c597ef9b6..d88231e3b2be 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2966,7 +2966,7 @@ static int __init dquot_init(void) NULL); order = 0; - dquot_hash = (struct hlist_head *)__get_free_pages(GFP_ATOMIC, order); + dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order); if (!dquot_hash) panic("Cannot create dquot hash table"); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index bd39a998843d..5089dac02660 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -687,8 +687,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod reiserfs_update_inode_transaction(inode); reiserfs_update_inode_transaction(dir); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: @@ -771,8 +770,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode goto out_failed; } - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: @@ -871,8 +869,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode /* the above add_entry did not update dir's stat data */ reiserfs_update_sd(&th, dir); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: reiserfs_write_unlock(dir->i_sb); @@ -1187,8 +1184,7 @@ static int reiserfs_symlink(struct inode *parent_dir, goto out_failed; } - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); retval = journal_end(&th); out_failed: reiserfs_write_unlock(parent_dir->i_sb); diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index fe999157dd97..e39b3910d24d 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -389,27 +389,13 @@ static int show_journal(struct seq_file *m, void *unused) return 0; } -static int r_open(struct inode *inode, struct file *file) -{ - return single_open(file, PDE_DATA(inode), - proc_get_parent_data(inode)); -} - -static const struct file_operations r_file_operations = { - .open = r_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static struct proc_dir_entry *proc_info_root = NULL; static const char proc_info_root_name[] = "fs/reiserfs"; static void add_file(struct super_block *sb, char *name, int (*func) (struct seq_file *, void *)) { - proc_create_data(name, 0, REISERFS_SB(sb)->procdir, - &r_file_operations, func); + proc_create_single_data(name, 0, REISERFS_SB(sb)->procdir, func, sb); } int reiserfs_proc_info_init(struct super_block *sb) diff --git a/fs/seq_file.c b/fs/seq_file.c index c6c27f1f9c98..4cc090b50cc5 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -709,11 +709,6 @@ void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, if (m->count + width >= m->size) goto overflow; - if (num < 10) { - m->buf[m->count++] = num + '0'; - return; - } - len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; diff --git a/fs/super.c b/fs/super.c index c9e34cd2b759..50728d9c1a05 100644 --- a/fs/super.c +++ b/fs/super.c @@ -121,13 +121,23 @@ static unsigned long super_cache_count(struct shrinker *shrink, sb = container_of(shrink, struct super_block, s_shrink); /* - * Don't call trylock_super as it is a potential - * scalability bottleneck. The counts could get updated - * between super_cache_count and super_cache_scan anyway. - * Call to super_cache_count with shrinker_rwsem held - * ensures the safety of call to list_lru_shrink_count() and - * s_op->nr_cached_objects(). + * We don't call trylock_super() here as it is a scalability bottleneck, + * so we're exposed to partial setup state. The shrinker rwsem does not + * protect filesystem operations backing list_lru_shrink_count() or + * s_op->nr_cached_objects(). Counts can change between + * super_cache_count and super_cache_scan, so we really don't need locks + * here. + * + * However, if we are currently mounting the superblock, the underlying + * filesystem might be in a state of partial construction and hence it + * is dangerous to access it. trylock_super() uses a SB_BORN check to + * avoid this situation, so do the same here. The memory barrier is + * matched with the one in mount_fs() as we don't hold locks here. */ + if (!(sb->s_flags & SB_BORN)) + return 0; + smp_rmb(); + if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); @@ -167,6 +177,7 @@ static void destroy_unused_super(struct super_block *s) security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); + free_prealloced_shrinker(&s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } @@ -252,6 +263,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; + if (prealloc_shrinker(&s->s_shrink)) + goto fail; return s; fail: @@ -518,11 +531,7 @@ retry: hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); - err = register_shrinker(&s->s_shrink); - if (err) { - deactivate_locked_super(s); - s = ERR_PTR(err); - } + register_shrinker_prepared(&s->s_shrink); return s; } @@ -1273,6 +1282,14 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) sb = root->d_sb; BUG_ON(!sb); WARN_ON(!sb->s_bdi); + + /* + * Write barrier is for super_cache_count(). We place it before setting + * SB_BORN as the data dependency between the two functions is the + * superblock structure contents that we just set up, not the SB_BORN + * flag. + */ + smp_wmb(); sb->s_flags |= SB_BORN; error = security_sb_kern_mount(sb, flags, secdata); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index b428d317ae92..92682fcc41f6 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -25,7 +25,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, { struct dentry *root; void *ns; - bool new_sb; + bool new_sb = false; if (!(flags & SB_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) @@ -35,9 +35,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); root = kernfs_mount_ns(fs_type, flags, sysfs_root, SYSFS_MAGIC, &new_sb, ns); - if (IS_ERR(root) || !new_sb) + if (!new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); - else if (new_sb) + else if (!IS_ERR(root)) root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; return root; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 0458dd47e105..c586026508db 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -622,8 +622,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } @@ -733,8 +732,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inc_nlink(dir); dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index f897e55f2cd0..16a8ad21b77e 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -28,6 +28,9 @@ #include "udf_sb.h" +#define SURROGATE_MASK 0xfffff800 +#define SURROGATE_PAIR 0x0000d800 + static int udf_uni2char_utf8(wchar_t uni, unsigned char *out, int boundlen) @@ -37,6 +40,9 @@ static int udf_uni2char_utf8(wchar_t uni, if (boundlen <= 0) return -ENAMETOOLONG; + if ((uni & SURROGATE_MASK) == SURROGATE_PAIR) + return -EINVAL; + if (uni < 0x80) { out[u_len++] = (unsigned char)uni; } else if (uni < 0x800) { diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 32545cd00ceb..d5f43ba76c59 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -39,8 +39,7 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) { int err = ufs_add_link(dentry, inode); if (!err) { - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; } inode_dec_link_count(inode); @@ -193,8 +192,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) if (err) goto out_fail; - unlock_new_inode(inode); - d_instantiate(dentry, inode); + d_instantiate_new(dentry, inode); return 0; out_fail: diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index ce4a34a2751d..35a124400d60 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -511,7 +511,14 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) if (args->flags & ATTR_CREATE) return retval; retval = xfs_attr_shortform_remove(args); - ASSERT(retval == 0); + if (retval) + return retval; + /* + * Since we have removed the old attr, clear ATTR_REPLACE so + * that the leaf format add routine won't trip over the attr + * not being around. + */ + args->flags &= ~ATTR_REPLACE; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6a7c2f03ea11..040eeda8426f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -725,12 +725,16 @@ xfs_bmap_extents_to_btree( *logflagsp = 0; if ((error = xfs_alloc_vextent(&args))) { xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return -ENOSPC; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index ef68b1de006a..1201107eabc6 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -466,6 +466,8 @@ xfs_dinode_verify( return __this_address; if (di_size > XFS_DFORK_DSIZE(dip, mp)) return __this_address; + if (dip->di_nextents) + return __this_address; /* fall through */ case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -484,12 +486,31 @@ xfs_dinode_verify( if (XFS_DFORK_Q(dip)) { switch (dip->di_aformat) { case XFS_DINODE_FMT_LOCAL: + if (dip->di_anextents) + return __this_address; + /* fall through */ case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: break; default: return __this_address; } + } else { + /* + * If there is no fork offset, this may be a freshly-made inode + * in a new disk cluster, in which case di_aformat is zeroed. + * Otherwise, such an inode must be in EXTENTS format; this goes + * for freed inodes as well. + */ + switch (dip->di_aformat) { + case 0: + case XFS_DINODE_FMT_EXTENTS: + break; + default: + return __this_address; + } + if (dip->di_anextents) + return __this_address; } /* only version 3 or greater inodes are extensively verified here */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 0ab824f574ed..102463543db3 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -594,7 +594,7 @@ xfs_alloc_ioend( struct xfs_ioend *ioend; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset); xfs_init_bio_from_bh(bio, bh); ioend = container_of(bio, struct xfs_ioend, io_inline_bio); diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 69346d460dfa..694c85b03813 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -18,7 +18,7 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern struct bio_set *xfs_ioend_bioset; +extern struct bio_set xfs_ioend_bioset; /* * Types of I/O for bmap clustering and I/O completion tracking. diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 299aee4b7b0b..e70fb8ccecea 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -778,22 +778,26 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_INSERT_RANGE) { - unsigned int blksize_mask = i_blocksize(inode) - 1; + unsigned int blksize_mask = i_blocksize(inode) - 1; + loff_t isize = i_size_read(inode); - new_size = i_size_read(inode) + len; if (offset & blksize_mask || len & blksize_mask) { error = -EINVAL; goto out_unlock; } - /* check the new inode size does not wrap through zero */ - if (new_size > inode->i_sb->s_maxbytes) { + /* + * New inode size must not exceed ->s_maxbytes, accounting for + * possible signed overflow. + */ + if (inode->i_sb->s_maxbytes - isize < len) { error = -EFBIG; goto out_unlock; } + new_size = isize + len; /* Offset should be less than i_size */ - if (offset >= i_size_read(inode)) { + if (offset >= isize) { error = -EINVAL; goto out_unlock; } @@ -876,8 +880,18 @@ xfs_file_dedupe_range( struct file *dst_file, u64 dst_loff) { + struct inode *srci = file_inode(src_file); + u64 max_dedupe; int error; + /* + * Since we have to read all these pages in to compare them, cut + * it off at MAX_RW_COUNT/2 rounded down to the nearest block. + * That means we won't do more than MAX_RW_COUNT IO per request. + */ + max_dedupe = (MAX_RW_COUNT >> 1) & ~(i_blocksize(srci) - 1); + if (len > max_dedupe) + len = max_dedupe; error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, len, true); if (error) diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 056e12b421eb..1cc79907b377 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -113,6 +113,7 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) } } +#ifdef CONFIG_PROC_FS /* legacy quota interfaces */ #ifdef CONFIG_XFS_QUOTA static int xqm_proc_show(struct seq_file *m, void *v) @@ -124,18 +125,6 @@ static int xqm_proc_show(struct seq_file *m, void *v) return 0; } -static int xqm_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, xqm_proc_show, NULL); -} - -static const struct file_operations xqm_proc_fops = { - .open = xqm_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /* legacy quota stats interface no 2 */ static int xqmstat_proc_show(struct seq_file *m, void *v) { @@ -147,22 +136,8 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) seq_putc(m, '\n'); return 0; } - -static int xqmstat_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, xqmstat_proc_show, NULL); -} - -static const struct file_operations xqmstat_proc_fops = { - .owner = THIS_MODULE, - .open = xqmstat_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif /* CONFIG_XFS_QUOTA */ -#ifdef CONFIG_PROC_FS int xfs_init_procfs(void) { @@ -174,11 +149,9 @@ xfs_init_procfs(void) goto out; #ifdef CONFIG_XFS_QUOTA - if (!proc_create("fs/xfs/xqmstat", 0, NULL, - &xqmstat_proc_fops)) + if (!proc_create_single("fs/xfs/xqmstat", 0, NULL, xqmstat_proc_show)) goto out; - if (!proc_create("fs/xfs/xqm", 0, NULL, - &xqm_proc_fops)) + if (!proc_create_single("fs/xfs/xqm", 0, NULL, xqm_proc_show)) goto out; #endif return 0; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d71424052917..f643d76db516 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -63,7 +63,7 @@ #include <linux/parser.h> static const struct super_operations xfs_super_operations; -struct bio_set *xfs_ioend_bioset; +struct bio_set xfs_ioend_bioset; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG @@ -1845,10 +1845,9 @@ MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) { - xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE, + if (bioset_init(&xfs_ioend_bioset, 4 * MAX_BUF_PER_PAGE, offsetof(struct xfs_ioend, io_inline_bio), - BIOSET_NEED_BVECS); - if (!xfs_ioend_bioset) + BIOSET_NEED_BVECS)) goto out; xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), @@ -1997,7 +1996,7 @@ xfs_init_zones(void) out_destroy_log_ticket_zone: kmem_zone_destroy(xfs_log_ticket_zone); out_free_ioend_bioset: - bioset_free(xfs_ioend_bioset); + bioset_exit(&xfs_ioend_bioset); out: return -ENOMEM; } @@ -2029,7 +2028,7 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_btree_cur_zone); kmem_zone_destroy(xfs_bmap_free_item_zone); kmem_zone_destroy(xfs_log_ticket_zone); - bioset_free(xfs_ioend_bioset); + bioset_exit(&xfs_ioend_bioset); } STATIC int __init |