diff options
Diffstat (limited to 'fs')
288 files changed, 6960 insertions, 4006 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index aaee1e6584e6..60cd4ba04afc 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -58,7 +58,7 @@ extern const struct file_operations v9fs_mmap_file_operations_dotl; extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); -void v9fs_destroy_inode(struct inode *inode); +void v9fs_free_inode(struct inode *inode); struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t); int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 72b779bc0942..24050e866e64 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -253,21 +253,15 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) } /** - * v9fs_destroy_inode - destroy an inode + * v9fs_free_inode - destroy an inode * */ -static void v9fs_i_callback(struct rcu_head *head) +void v9fs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(v9fs_inode_cache, V9FS_I(inode)); } -void v9fs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, v9fs_i_callback); -} - int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t rdev) { diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index d13d35cf69c0..67d1b965adcd 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -344,7 +344,7 @@ static int v9fs_write_inode_dotl(struct inode *inode, static const struct super_operations v9fs_super_ops = { .alloc_inode = v9fs_alloc_inode, - .destroy_inode = v9fs_destroy_inode, + .free_inode = v9fs_free_inode, .statfs = simple_statfs, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, @@ -354,7 +354,7 @@ static const struct super_operations v9fs_super_ops = { static const struct super_operations v9fs_super_ops_dotl = { .alloc_inode = v9fs_alloc_inode, - .destroy_inode = v9fs_destroy_inode, + .free_inode = v9fs_free_inode, .statfs = v9fs_statfs, .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, diff --git a/fs/Makefile b/fs/Makefile index 427fec226fae..35945f8139e6 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -25,7 +25,7 @@ obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o -obj-$(CONFIG_ANON_INODES) += anon_inodes.o +obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index 0fbfd0b04ae0..382c9d7ad375 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -24,8 +24,11 @@ static inline unsigned int adfs_readval(unsigned char *p, int len) switch (len) { case 4: val |= p[3] << 24; + /* fall through */ case 3: val |= p[2] << 16; + /* fall through */ case 2: val |= p[1] << 8; + /* fall through */ default: val |= p[0]; } return val; @@ -35,8 +38,11 @@ static inline void adfs_writeval(unsigned char *p, int len, unsigned int val) { switch (len) { case 4: p[3] = val >> 24; + /* fall through */ case 3: p[2] = val >> 16; + /* fall through */ case 2: p[1] = val >> 8; + /* fall through */ default: p[0] = val; } } diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 7e099a7a4eb1..2a83655c408f 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -248,17 +248,11 @@ static struct inode *adfs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void adfs_i_callback(struct rcu_head *head) +static void adfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); } -static void adfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, adfs_i_callback); -} - static void init_once(void *foo) { struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; @@ -290,7 +284,7 @@ static void destroy_inodecache(void) static const struct super_operations adfs_sops = { .alloc_inode = adfs_alloc_inode, - .destroy_inode = adfs_destroy_inode, + .free_inode = adfs_free_inode, .drop_inode = generic_delete_inode, .write_inode = adfs_write_inode, .put_super = adfs_put_super, diff --git a/fs/affs/super.c b/fs/affs/super.c index d1ad11a8a4a5..7370228eefb2 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -111,17 +111,11 @@ static struct inode *affs_alloc_inode(struct super_block *sb) return &i->vfs_inode; } -static void affs_i_callback(struct rcu_head *head) +static void affs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); } -static void affs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, affs_i_callback); -} - static void init_once(void *foo) { struct affs_inode_info *ei = (struct affs_inode_info *) foo; @@ -155,7 +149,7 @@ static void destroy_inodecache(void) static const struct super_operations affs_sops = { .alloc_inode = affs_alloc_inode, - .destroy_inode = affs_destroy_inode, + .free_inode = affs_free_inode, .write_inode = affs_write_inode, .evict_inode = affs_evict_inode, .put_super = affs_put_super, @@ -487,7 +481,7 @@ got_root: break; case MUFS_OFS: affs_set_opt(sbi->s_flags, SF_MUFS); - /* fall thru */ + /* fall through */ case FS_OFS: affs_set_opt(sbi->s_flags, SF_OFS); sb->s_flags |= SB_NOEXEC; @@ -495,6 +489,7 @@ got_root: case MUFS_DCOFS: case MUFS_INTLOFS: affs_set_opt(sbi->s_flags, SF_MUFS); + /* fall through */ case FS_DCOFS: case FS_INTLOFS: affs_set_opt(sbi->s_flags, SF_INTL); diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 1c7955f5cdaf..128f2dbe256a 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -203,8 +203,7 @@ void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi) */ void afs_init_callback_state(struct afs_server *server) { - if (!test_and_clear_bit(AFS_SERVER_FL_NEW, &server->flags)) - server->cb_s_break++; + server->cb_s_break++; } /* diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 8ee5972893ed..748090014519 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -34,7 +34,7 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *); static int afs_deliver_yfs_cb_callback(struct afs_call *); #define CM_NAME(name) \ - const char afs_SRXCB##name##_name[] __tracepoint_string = \ + char afs_SRXCB##name##_name[] __tracepoint_string = \ "CB." #name /* @@ -285,6 +285,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->unmarshall++; /* extract the FID array and its count in two steps */ + /* fall through */ case 1: _debug("extract FID count"); ret = afs_extract_data(call, true); @@ -304,6 +305,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) afs_extract_to_buf(call, call->count * 3 * 4); call->unmarshall++; + /* Fall through */ case 2: _debug("extract FID array"); ret = afs_extract_data(call, true); @@ -329,6 +331,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->unmarshall++; /* extract the callback array and its count in two steps */ + /* fall through */ case 3: _debug("extract CB count"); ret = afs_extract_data(call, true); @@ -344,6 +347,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) iov_iter_discard(&call->iter, READ, call->count2 * 3 * 4); call->unmarshall++; + /* Fall through */ case 4: _debug("extract discard %zu/%u", iov_iter_count(&call->iter), call->count2 * 3 * 4); @@ -422,6 +426,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) afs_extract_to_buf(call, 11 * sizeof(__be32)); call->unmarshall++; + /* Fall through */ case 1: _debug("extract UUID"); ret = afs_extract_data(call, false); @@ -537,6 +542,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) afs_extract_to_buf(call, 11 * sizeof(__be32)); call->unmarshall++; + /* Fall through */ case 1: _debug("extract UUID"); ret = afs_extract_data(call, false); @@ -673,6 +679,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) call->unmarshall++; /* extract the FID array and its count in two steps */ + /* Fall through */ case 1: _debug("extract FID count"); ret = afs_extract_data(call, true); @@ -692,6 +699,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) afs_extract_to_buf(call, size); call->unmarshall++; + /* Fall through */ case 2: _debug("extract FID array"); ret = afs_extract_data(call, false); diff --git a/fs/afs/file.c b/fs/afs/file.c index 323ae9912203..e8d6619890a9 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -300,6 +300,8 @@ int afs_page_filler(void *data, struct page *page) /* page will not be cached */ case -ENOBUFS: _debug("cache said ENOBUFS"); + + /* fall through */ default: go_on: req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *), diff --git a/fs/afs/flock.c b/fs/afs/flock.c index e432bd27a2e7..6a0174258382 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -303,6 +303,7 @@ again: return; } + /* Fall through */ default: /* Looks like a lock request was withdrawn. */ spin_unlock(&vnode->lock); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 0b37867b5c20..b68471ce5c35 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -498,7 +498,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) afs_extract_to_tmp(call); } - /* extract the returned data length */ + /* Fall through - and extract the returned data length */ case 1: _debug("extract data length"); ret = afs_extract_data(call, true); @@ -525,7 +525,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) iov_iter_bvec(&call->iter, READ, call->bvec, 1, size); ASSERTCMP(size, <=, PAGE_SIZE); - /* extract the returned data */ + /* Fall through - and extract the returned data */ case 2: _debug("extract data %zu/%llu", iov_iter_count(&call->iter), req->remain); @@ -552,6 +552,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) /* Discard any excess data the server gave us */ iov_iter_discard(&call->iter, READ, req->actual_len - req->len); call->unmarshall = 3; + + /* Fall through */ case 3: _debug("extract discard %zu/%llu", iov_iter_count(&call->iter), req->actual_len - req->len); @@ -564,7 +566,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) call->unmarshall = 4; afs_extract_to_buf(call, (21 + 3 + 6) * 4); - /* extract the metadata */ + /* Fall through - and extract the metadata */ case 4: ret = afs_extract_data(call, false); if (ret < 0) @@ -1634,7 +1636,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->unmarshall++; afs_extract_to_buf(call, 12 * 4); - /* extract the returned status record */ + /* Fall through - and extract the returned status record */ case 1: _debug("extract status"); ret = afs_extract_data(call, true); @@ -1646,7 +1648,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->unmarshall++; afs_extract_to_tmp(call); - /* extract the volume name length */ + /* Fall through - and extract the volume name length */ case 2: ret = afs_extract_data(call, true); if (ret < 0) @@ -1661,7 +1663,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_begin(call, call->reply[2], size); call->unmarshall++; - /* extract the volume name */ + /* Fall through - and extract the volume name */ case 3: _debug("extract volname"); ret = afs_extract_data(call, true); @@ -1674,7 +1676,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* extract the offline message length */ + /* Fall through - and extract the offline message length */ case 4: ret = afs_extract_data(call, true); if (ret < 0) @@ -1689,7 +1691,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_begin(call, call->reply[2], size); call->unmarshall++; - /* extract the offline message */ + /* Fall through - and extract the offline message */ case 5: _debug("extract offline"); ret = afs_extract_data(call, true); @@ -1703,7 +1705,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* extract the message of the day length */ + /* Fall through - and extract the message of the day length */ case 6: ret = afs_extract_data(call, true); if (ret < 0) @@ -1718,7 +1720,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_begin(call, call->reply[2], size); call->unmarshall++; - /* extract the message of the day */ + /* Fall through - and extract the message of the day */ case 7: _debug("extract motd"); ret = afs_extract_data(call, false); @@ -2016,7 +2018,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* Extract the capabilities word count */ + /* Fall through - and extract the capabilities word count */ case 1: ret = afs_extract_data(call, true); if (ret < 0) @@ -2029,7 +2031,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) iov_iter_discard(&call->iter, READ, count * sizeof(__be32)); call->unmarshall++; - /* Extract capabilities words */ + /* Fall through - and extract capabilities words */ case 2: ret = afs_extract_data(call, false); if (ret < 0) @@ -2206,6 +2208,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) call->unmarshall++; /* Extract the file status count and array in two steps */ + /* Fall through */ case 1: _debug("extract status count"); ret = afs_extract_data(call, true); @@ -2223,6 +2226,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) more_counts: afs_extract_to_buf(call, 21 * sizeof(__be32)); + /* Fall through */ case 2: _debug("extract status array %u", call->count); ret = afs_extract_data(call, true); @@ -2246,6 +2250,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) afs_extract_to_tmp(call); /* Extract the callback count and array in two steps */ + /* Fall through */ case 3: _debug("extract CB count"); ret = afs_extract_data(call, true); @@ -2262,6 +2267,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) more_cbs: afs_extract_to_buf(call, 3 * sizeof(__be32)); + /* Fall through */ case 4: _debug("extract CB array"); ret = afs_extract_data(call, true); @@ -2284,6 +2290,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) afs_extract_to_buf(call, 6 * sizeof(__be32)); call->unmarshall++; + /* Fall through */ case 5: ret = afs_extract_data(call, false); if (ret < 0) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 1a4ce07fb406..9cedc3fc1b77 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -216,9 +216,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) set_nlink(inode, 2); inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; - inode->i_ctime.tv_sec = get_seconds(); - inode->i_ctime.tv_nsec = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime; + inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode); inode->i_blocks = 0; inode_set_iversion_raw(inode, 0); inode->i_generation = 0; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index bb1f244b2b3a..3904ab0b9563 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -474,7 +474,6 @@ struct afs_server { time64_t put_time; /* Time at which last put */ time64_t update_at; /* Time at which to next update the record */ unsigned long flags; -#define AFS_SERVER_FL_NEW 0 /* New server, don't inc cb_s_break */ #define AFS_SERVER_FL_NOT_READY 1 /* The record is not ready for use */ #define AFS_SERVER_FL_NOT_FOUND 2 /* VL server says no such server */ #define AFS_SERVER_FL_VL_FAIL 3 /* Failed to access VL server */ @@ -827,7 +826,7 @@ static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) { - return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break; + return vnode->cb_break + vnode->cb_v_break; } static inline bool afs_cb_is_broken(unsigned int cb_break, @@ -835,7 +834,6 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, const struct afs_cb_interest *cbi) { return !cbi || cb_break != (vnode->cb_break + - cbi->server->cb_s_break + vnode->volume->cb_v_break); } diff --git a/fs/afs/misc.c b/fs/afs/misc.c index bbb1fd51b019..7f2af061ea06 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -131,33 +131,42 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) if (e->error == -ETIMEDOUT || e->error == -ETIME) return; + /* Fall through */ case -ETIMEDOUT: case -ETIME: if (e->error == -ENOMEM || e->error == -ENONET) return; + /* Fall through */ case -ENOMEM: case -ENONET: if (e->error == -ERFKILL) return; + /* Fall through */ case -ERFKILL: if (e->error == -EADDRNOTAVAIL) return; + /* Fall through */ case -EADDRNOTAVAIL: if (e->error == -ENETUNREACH) return; + /* Fall through */ case -ENETUNREACH: if (e->error == -EHOSTUNREACH) return; + /* Fall through */ case -EHOSTUNREACH: if (e->error == -EHOSTDOWN) return; + /* Fall through */ case -EHOSTDOWN: if (e->error == -ECONNREFUSED) return; + /* Fall through */ case -ECONNREFUSED: if (e->error == -ECONNRESET) return; + /* Fall through */ case -ECONNRESET: /* Responded, but call expired. */ if (e->responded) return; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 2c588f9bbbda..3ed2c99c58ab 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -572,13 +572,17 @@ static void afs_deliver_to_call(struct afs_call *call) case -ENODATA: case -EBADMSG: case -EMSGSIZE: - default: abort_code = RXGEN_CC_UNMARSHAL; if (state != AFS_CALL_CL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KUM"); goto local_abort; + default: + abort_code = RX_USER_ABORT; + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + abort_code, ret, "KER"); + goto local_abort; } } @@ -610,6 +614,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, bool stalled = false; u64 rtt; u32 life, last_life; + bool rxrpc_complete = false; DECLARE_WAITQUEUE(myself, current); @@ -621,7 +626,7 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, rtt2 = 2; timeout = rtt2; - last_life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); + rxrpc_kernel_check_life(call->net->socket, call->rxcall, &last_life); add_wait_queue(&call->waitq, &myself); for (;;) { @@ -639,7 +644,12 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, if (afs_check_call_state(call, AFS_CALL_COMPLETE)) break; - life = rxrpc_kernel_check_life(call->net->socket, call->rxcall); + if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall, &life)) { + /* rxrpc terminated the call. */ + rxrpc_complete = true; + break; + } + if (timeout == 0 && life == last_life && signal_pending(current)) { if (stalled) @@ -663,12 +673,16 @@ static long afs_wait_for_call_to_complete(struct afs_call *call, remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); - /* Kill off the call if it's still live. */ if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { - _debug("call interrupted"); - if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall, - RX_USER_ABORT, -EINTR, "KWI")) - afs_set_call_complete(call, -EINTR, 0); + if (rxrpc_complete) { + afs_set_call_complete(call, call->error, call->abort_code); + } else { + /* Kill off the call if it's still live. */ + _debug("call interrupted"); + if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + RX_USER_ABORT, -EINTR, "KWI")) + afs_set_call_complete(call, -EINTR, 0); + } } spin_lock_bh(&call->state_lock); @@ -909,6 +923,7 @@ void afs_send_empty_reply(struct afs_call *call) _debug("oom"); rxrpc_kernel_abort_call(net->socket, call->rxcall, RX_USER_ABORT, -ENOMEM, "KOO"); + /* Fall through */ default: _leave(" [error]"); return; diff --git a/fs/afs/server.c b/fs/afs/server.c index 642afa2e9783..65b33b6da48b 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -226,7 +226,6 @@ static struct afs_server *afs_alloc_server(struct afs_net *net, RCU_INIT_POINTER(server->addresses, alist); server->addr_version = alist->version; server->uuid = *uuid; - server->flags = (1UL << AFS_SERVER_FL_NEW); server->update_at = ktime_get_real_seconds() + afs_server_update_delay; rwlock_init(&server->fs_lock); INIT_HLIST_HEAD(&server->cb_volumes); diff --git a/fs/afs/super.c b/fs/afs/super.c index 5adf012b8e27..bab89763119b 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -33,6 +33,7 @@ static void afs_i_init_once(void *foo); static void afs_kill_super(struct super_block *sb); static struct inode *afs_alloc_inode(struct super_block *sb); static void afs_destroy_inode(struct inode *inode); +static void afs_free_inode(struct inode *inode); static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); static int afs_show_devname(struct seq_file *m, struct dentry *root); static int afs_show_options(struct seq_file *m, struct dentry *root); @@ -56,6 +57,7 @@ static const struct super_operations afs_super_ops = { .alloc_inode = afs_alloc_inode, .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, + .free_inode = afs_free_inode, .evict_inode = afs_evict_inode, .show_devname = afs_show_devname, .show_options = afs_show_options, @@ -660,11 +662,9 @@ static struct inode *afs_alloc_inode(struct super_block *sb) return &vnode->vfs_inode; } -static void afs_i_callback(struct rcu_head *head) +static void afs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); - struct afs_vnode *vnode = AFS_FS_I(inode); - kmem_cache_free(afs_inode_cachep, vnode); + kmem_cache_free(afs_inode_cachep, AFS_FS_I(inode)); } /* @@ -680,7 +680,6 @@ static void afs_destroy_inode(struct inode *inode) ASSERTCMP(vnode->cb_interest, ==, NULL); - call_rcu(&inode->i_rcu, afs_i_callback); atomic_dec(&afs_count_active_inodes); } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index c3d9e5a5f67e..b0175b3ef0e8 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -195,7 +195,9 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32)); call->unmarshall++; - /* Extract the returned uuid, uniquifier, nentries and blkaddrs size */ + /* Extract the returned uuid, uniquifier, nentries and + * blkaddrs size */ + /* Fall through */ case 1: ret = afs_extract_data(call, true); if (ret < 0) @@ -220,7 +222,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) count = min(call->count, 4U); afs_extract_to_buf(call, count * sizeof(__be32)); - /* Extract entries */ + /* Fall through - and extract entries */ case 2: ret = afs_extract_data(call, call->count > 4); if (ret < 0) @@ -323,7 +325,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* Extract the capabilities word count */ + /* Fall through - and extract the capabilities word count */ case 1: ret = afs_extract_data(call, true); if (ret < 0) @@ -336,7 +338,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call) call->unmarshall++; afs_extract_discard(call, count * sizeof(__be32)); - /* Extract capabilities words */ + /* Fall through - and extract capabilities words */ case 2: ret = afs_extract_data(call, false); if (ret < 0) @@ -436,6 +438,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) /* Extract the returned uuid, uniquifier, fsEndpoints count and * either the first fsEndpoint type or the volEndpoints * count if there are no fsEndpoints. */ + /* Fall through */ case 1: ret = afs_extract_data(call, true); if (ret < 0) @@ -476,7 +479,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) afs_extract_to_buf(call, size); call->unmarshall = 2; - /* Extract fsEndpoints[] entries */ + /* Fall through - and extract fsEndpoints[] entries */ case 2: ret = afs_extract_data(call, true); if (ret < 0) @@ -529,6 +532,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) * extract the type of the next endpoint when we extract the * data of the current one, but this is the first... */ + /* Fall through */ case 3: ret = afs_extract_data(call, true); if (ret < 0) @@ -555,7 +559,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) afs_extract_to_buf(call, size); call->unmarshall = 4; - /* Extract volEndpoints[] entries */ + /* Fall through - and extract volEndpoints[] entries */ case 4: ret = afs_extract_data(call, true); if (ret < 0) @@ -591,7 +595,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) afs_extract_discard(call, 0); call->unmarshall = 5; - /* Done */ + /* Fall through - Done */ case 5: ret = afs_extract_data(call, false); if (ret < 0) diff --git a/fs/afs/write.c b/fs/afs/write.c index 72efcfcf9f95..0122d7445fba 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -264,6 +264,7 @@ static void afs_kill_pages(struct address_space *mapping, first = page->index + 1; lock_page(page); generic_error_remove_page(mapping, page); + unlock_page(page); } __pagevec_release(&pv); diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 6e97a42d24d1..871e29f06257 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -544,7 +544,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) afs_extract_to_tmp64(call); call->unmarshall++; - /* extract the returned data length */ + /* Fall through - and extract the returned data length */ case 1: _debug("extract data length"); ret = afs_extract_data(call, true); @@ -571,7 +571,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) iov_iter_bvec(&call->iter, READ, call->bvec, 1, size); ASSERTCMP(size, <=, PAGE_SIZE); - /* extract the returned data */ + /* Fall through - and extract the returned data */ case 2: _debug("extract data %zu/%llu", iov_iter_count(&call->iter), req->remain); @@ -598,6 +598,8 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) /* Discard any excess data the server gave us */ iov_iter_discard(&call->iter, READ, req->actual_len - req->len); call->unmarshall = 3; + + /* Fall through */ case 3: _debug("extract discard %zu/%llu", iov_iter_count(&call->iter), req->actual_len - req->len); @@ -613,7 +615,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) sizeof(struct yfs_xdr_YFSCallBack) + sizeof(struct yfs_xdr_YFSVolSync)); - /* extract the metadata */ + /* Fall through - and extract the metadata */ case 4: ret = afs_extract_data(call, false); if (ret < 0) @@ -629,6 +631,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) call->unmarshall++; + /* Fall through */ case 5: break; } @@ -1584,7 +1587,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) call->unmarshall++; afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchVolumeStatus)); - /* extract the returned status record */ + /* Fall through - and extract the returned status record */ case 1: _debug("extract status"); ret = afs_extract_data(call, true); @@ -1596,7 +1599,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) call->unmarshall++; afs_extract_to_tmp(call); - /* extract the volume name length */ + /* Fall through - and extract the volume name length */ case 2: ret = afs_extract_data(call, true); if (ret < 0) @@ -1611,7 +1614,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_begin(call, call->reply[2], size); call->unmarshall++; - /* extract the volume name */ + /* Fall through - and extract the volume name */ case 3: _debug("extract volname"); ret = afs_extract_data(call, true); @@ -1624,7 +1627,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* extract the offline message length */ + /* Fall through - and extract the offline message length */ case 4: ret = afs_extract_data(call, true); if (ret < 0) @@ -1639,7 +1642,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_begin(call, call->reply[2], size); call->unmarshall++; - /* extract the offline message */ + /* Fall through - and extract the offline message */ case 5: _debug("extract offline"); ret = afs_extract_data(call, true); @@ -1653,7 +1656,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; - /* extract the message of the day length */ + /* Fall through - and extract the message of the day length */ case 6: ret = afs_extract_data(call, true); if (ret < 0) @@ -1668,7 +1671,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) afs_extract_begin(call, call->reply[2], size); call->unmarshall++; - /* extract the message of the day */ + /* Fall through - and extract the message of the day */ case 7: _debug("extract motd"); ret = afs_extract_data(call, false); @@ -1681,6 +1684,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call) call->unmarshall++; + /* Fall through */ case 8: break; } @@ -2026,6 +2030,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) call->unmarshall++; /* Extract the file status count and array in two steps */ + /* Fall through */ case 1: _debug("extract status count"); ret = afs_extract_data(call, true); @@ -2043,6 +2048,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) more_counts: afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchStatus)); + /* Fall through */ case 2: _debug("extract status array %u", call->count); ret = afs_extract_data(call, true); @@ -2066,6 +2072,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) afs_extract_to_tmp(call); /* Extract the callback count and array in two steps */ + /* Fall through */ case 3: _debug("extract CB count"); ret = afs_extract_data(call, true); @@ -2082,6 +2089,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) more_cbs: afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSCallBack)); + /* Fall through */ case 4: _debug("extract CB array"); ret = afs_extract_data(call, true); @@ -2104,6 +2112,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync)); call->unmarshall++; + /* Fall through */ case 5: ret = afs_extract_data(call, false); if (ret < 0) @@ -2114,6 +2123,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) call->unmarshall++; + /* Fall through */ case 6: break; } @@ -181,7 +181,7 @@ struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool woken; + bool done; bool cancelled; struct wait_queue_entry wait; struct work_struct work; @@ -204,8 +204,7 @@ struct aio_kiocb { struct kioctx *ki_ctx; kiocb_cancel_fn *ki_cancel; - struct iocb __user *ki_user_iocb; /* user's aiocb */ - __u64 ki_user_data; /* user's data for completion */ + struct io_event ki_res; struct list_head ki_list; /* the aio core uses this * for cancellation */ @@ -1022,6 +1021,9 @@ static bool get_reqs_available(struct kioctx *ctx) /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. + * + * The refcount is initialized to 2 - one for the async op completion, + * one for the synchronous code that does this. */ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { @@ -1031,10 +1033,15 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) if (unlikely(!req)) return NULL; + if (unlikely(!get_reqs_available(ctx))) { + kmem_cache_free(kiocb_cachep, req); + return NULL; + } + percpu_ref_get(&ctx->reqs); req->ki_ctx = ctx; INIT_LIST_HEAD(&req->ki_list); - refcount_set(&req->ki_refcnt, 0); + refcount_set(&req->ki_refcnt, 2); req->ki_eventfd = NULL; return req; } @@ -1067,30 +1074,20 @@ out: return ret; } -static inline void iocb_put(struct aio_kiocb *iocb) -{ - if (refcount_read(&iocb->ki_refcnt) == 0 || - refcount_dec_and_test(&iocb->ki_refcnt)) { - if (iocb->ki_filp) - fput(iocb->ki_filp); - percpu_ref_put(&iocb->ki_ctx->reqs); - kmem_cache_free(kiocb_cachep, iocb); - } -} - -static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb, - long res, long res2) +static inline void iocb_destroy(struct aio_kiocb *iocb) { - ev->obj = (u64)(unsigned long)iocb->ki_user_iocb; - ev->data = iocb->ki_user_data; - ev->res = res; - ev->res2 = res2; + if (iocb->ki_eventfd) + eventfd_ctx_put(iocb->ki_eventfd); + if (iocb->ki_filp) + fput(iocb->ki_filp); + percpu_ref_put(&iocb->ki_ctx->reqs); + kmem_cache_free(kiocb_cachep, iocb); } /* aio_complete * Called when the io request on the given iocb is complete. */ -static void aio_complete(struct aio_kiocb *iocb, long res, long res2) +static void aio_complete(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; @@ -1114,14 +1111,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; - aio_fill_event(event, iocb, res, res2); + *event = iocb->ki_res; kunmap_atomic(ev_page); flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); - pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, - res, res2); + pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, + (void __user *)(unsigned long)iocb->ki_res.obj, + iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2); /* after flagging the request as done, we * must never even look at it again @@ -1148,10 +1145,8 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ - if (iocb->ki_eventfd) { + if (iocb->ki_eventfd) eventfd_signal(iocb->ki_eventfd, 1); - eventfd_ctx_put(iocb->ki_eventfd); - } /* * We have to order our ring_info tail store above and test @@ -1163,7 +1158,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - iocb_put(iocb); +} + +static inline void iocb_put(struct aio_kiocb *iocb) +{ + if (refcount_dec_and_test(&iocb->ki_refcnt)) { + aio_complete(iocb); + iocb_destroy(iocb); + } } /* aio_read_events_ring @@ -1437,7 +1439,9 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) file_end_write(kiocb->ki_filp); } - aio_complete(iocb, res, res2); + iocb->ki_res.res = res; + iocb->ki_res.res2 = res2; + iocb_put(iocb); } static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) @@ -1514,13 +1518,13 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret) } } -static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb, +static int aio_read(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; - ssize_t ret; + int ret; ret = aio_prep_rw(req, iocb); if (ret) @@ -1542,13 +1546,13 @@ static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb, return ret; } -static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, +static int aio_write(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; - ssize_t ret; + int ret; ret = aio_prep_rw(req, iocb); if (ret) @@ -1585,11 +1589,10 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, static void aio_fsync_work(struct work_struct *work) { - struct fsync_iocb *req = container_of(work, struct fsync_iocb, work); - int ret; + struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); - ret = vfs_fsync(req->file, req->datasync); - aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); + iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); + iocb_put(iocb); } static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, @@ -1608,11 +1611,6 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, return 0; } -static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) -{ - aio_complete(iocb, mangle_poll(mask), 0); -} - static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); @@ -1638,9 +1636,11 @@ static void aio_poll_complete_work(struct work_struct *work) return; } list_del_init(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); + req->done = true; spin_unlock_irq(&ctx->ctx_lock); - aio_poll_complete(iocb, mask); + iocb_put(iocb); } /* assumes we are called with irqs disabled */ @@ -1668,31 +1668,27 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, __poll_t mask = key_to_poll(key); unsigned long flags; - req->woken = true; - /* for instances that support it check for an event match first: */ - if (mask) { - if (!(mask & req->events)) - return 0; + if (mask && !(mask & req->events)) + return 0; + + list_del_init(&req->wait.entry); + if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { /* * Try to complete the iocb inline if we can. Use * irqsave/irqrestore because not all filesystems (e.g. fuse) * call this function with IRQs disabled and because IRQs * have to be disabled before ctx_lock is obtained. */ - if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { - list_del(&iocb->ki_list); - spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); - - list_del_init(&req->wait.entry); - aio_poll_complete(iocb, mask); - return 1; - } + list_del(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); + req->done = true; + spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); + iocb_put(iocb); + } else { + schedule_work(&req->work); } - - list_del_init(&req->wait.entry); - schedule_work(&req->work); return 1; } @@ -1719,11 +1715,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, add_wait_queue(head, &pt->iocb->poll.wait); } -static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) +static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) { struct kioctx *ctx = aiocb->ki_ctx; struct poll_iocb *req = &aiocb->poll; struct aio_poll_table apt; + bool cancel = false; __poll_t mask; /* reject any unknown events outside the normal event mask. */ @@ -1737,7 +1734,7 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; req->head = NULL; - req->woken = false; + req->done = false; req->cancelled = false; apt.pt._qproc = aio_poll_queue_proc; @@ -1749,156 +1746,135 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) INIT_LIST_HEAD(&req->wait.entry); init_waitqueue_func_entry(&req->wait, aio_poll_wake); - /* one for removal from waitqueue, one for this function */ - refcount_set(&aiocb->ki_refcnt, 2); - mask = vfs_poll(req->file, &apt.pt) & req->events; - if (unlikely(!req->head)) { - /* we did not manage to set up a waitqueue, done */ - goto out; - } - spin_lock_irq(&ctx->ctx_lock); - spin_lock(&req->head->lock); - if (req->woken) { - /* wake_up context handles the rest */ - mask = 0; + if (likely(req->head)) { + spin_lock(&req->head->lock); + if (unlikely(list_empty(&req->wait.entry))) { + if (apt.error) + cancel = true; + apt.error = 0; + mask = 0; + } + if (mask || apt.error) { + list_del_init(&req->wait.entry); + } else if (cancel) { + WRITE_ONCE(req->cancelled, true); + } else if (!req->done) { /* actually waiting for an event */ + list_add_tail(&aiocb->ki_list, &ctx->active_reqs); + aiocb->ki_cancel = aio_poll_cancel; + } + spin_unlock(&req->head->lock); + } + if (mask) { /* no async, we'd stolen it */ + aiocb->ki_res.res = mangle_poll(mask); apt.error = 0; - } else if (mask || apt.error) { - /* if we get an error or a mask we are done */ - WARN_ON_ONCE(list_empty(&req->wait.entry)); - list_del_init(&req->wait.entry); - } else { - /* actually waiting for an event */ - list_add_tail(&aiocb->ki_list, &ctx->active_reqs); - aiocb->ki_cancel = aio_poll_cancel; } - spin_unlock(&req->head->lock); spin_unlock_irq(&ctx->ctx_lock); - -out: - if (unlikely(apt.error)) - return apt.error; - if (mask) - aio_poll_complete(aiocb, mask); - iocb_put(aiocb); - return 0; + iocb_put(aiocb); + return apt.error; } static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, - struct iocb __user *user_iocb, bool compat) + struct iocb __user *user_iocb, struct aio_kiocb *req, + bool compat) { - struct aio_kiocb *req; - ssize_t ret; - - /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved2)) { - pr_debug("EINVAL: reserve field set\n"); - return -EINVAL; - } - - /* prevent overflows */ - if (unlikely( - (iocb->aio_buf != (unsigned long)iocb->aio_buf) || - (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || - ((ssize_t)iocb->aio_nbytes < 0) - )) { - pr_debug("EINVAL: overflow check\n"); - return -EINVAL; - } - - if (!get_reqs_available(ctx)) - return -EAGAIN; - - ret = -EAGAIN; - req = aio_get_req(ctx); - if (unlikely(!req)) - goto out_put_reqs_available; - req->ki_filp = fget(iocb->aio_fildes); - ret = -EBADF; if (unlikely(!req->ki_filp)) - goto out_put_req; + return -EBADF; if (iocb->aio_flags & IOCB_FLAG_RESFD) { + struct eventfd_ctx *eventfd; /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ - req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); - if (IS_ERR(req->ki_eventfd)) { - ret = PTR_ERR(req->ki_eventfd); - req->ki_eventfd = NULL; - goto out_put_req; - } + eventfd = eventfd_ctx_fdget(iocb->aio_resfd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + req->ki_eventfd = eventfd; } - ret = put_user(KIOCB_KEY, &user_iocb->aio_key); - if (unlikely(ret)) { + if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) { pr_debug("EFAULT: aio_key\n"); - goto out_put_req; + return -EFAULT; } - req->ki_user_iocb = user_iocb; - req->ki_user_data = iocb->aio_data; + req->ki_res.obj = (u64)(unsigned long)user_iocb; + req->ki_res.data = iocb->aio_data; + req->ki_res.res = 0; + req->ki_res.res2 = 0; switch (iocb->aio_lio_opcode) { case IOCB_CMD_PREAD: - ret = aio_read(&req->rw, iocb, false, compat); - break; + return aio_read(&req->rw, iocb, false, compat); case IOCB_CMD_PWRITE: - ret = aio_write(&req->rw, iocb, false, compat); - break; + return aio_write(&req->rw, iocb, false, compat); case IOCB_CMD_PREADV: - ret = aio_read(&req->rw, iocb, true, compat); - break; + return aio_read(&req->rw, iocb, true, compat); case IOCB_CMD_PWRITEV: - ret = aio_write(&req->rw, iocb, true, compat); - break; + return aio_write(&req->rw, iocb, true, compat); case IOCB_CMD_FSYNC: - ret = aio_fsync(&req->fsync, iocb, false); - break; + return aio_fsync(&req->fsync, iocb, false); case IOCB_CMD_FDSYNC: - ret = aio_fsync(&req->fsync, iocb, true); - break; + return aio_fsync(&req->fsync, iocb, true); case IOCB_CMD_POLL: - ret = aio_poll(req, iocb); - break; + return aio_poll(req, iocb); default: pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); - ret = -EINVAL; - break; + return -EINVAL; } - - /* - * If ret is 0, we'd either done aio_complete() ourselves or have - * arranged for that to be done asynchronously. Anything non-zero - * means that we need to destroy req ourselves. - */ - if (ret) - goto out_put_req; - return 0; -out_put_req: - if (req->ki_eventfd) - eventfd_ctx_put(req->ki_eventfd); - iocb_put(req); -out_put_reqs_available: - put_reqs_available(ctx, 1); - return ret; } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, bool compat) { + struct aio_kiocb *req; struct iocb iocb; + int err; if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) return -EFAULT; - return __io_submit_one(ctx, &iocb, user_iocb, compat); + /* enforce forwards compatibility on users */ + if (unlikely(iocb.aio_reserved2)) { + pr_debug("EINVAL: reserve field set\n"); + return -EINVAL; + } + + /* prevent overflows */ + if (unlikely( + (iocb.aio_buf != (unsigned long)iocb.aio_buf) || + (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || + ((ssize_t)iocb.aio_nbytes < 0) + )) { + pr_debug("EINVAL: overflow check\n"); + return -EINVAL; + } + + req = aio_get_req(ctx); + if (unlikely(!req)) + return -EAGAIN; + + err = __io_submit_one(ctx, &iocb, user_iocb, req, compat); + + /* Done with the synchronous reference */ + iocb_put(req); + + /* + * If err is 0, we'd either done aio_complete() ourselves or have + * arranged for that to be done asynchronously. Anything non-zero + * means that we need to destroy req ourselves. + */ + if (unlikely(err)) { + iocb_destroy(req); + put_reqs_available(ctx, 1); + } + return err; } /* sys_io_submit: @@ -1997,24 +1973,6 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, } #endif -/* lookup_kiocb - * Finds a given iocb for cancellation. - */ -static struct aio_kiocb * -lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb) -{ - struct aio_kiocb *kiocb; - - assert_spin_locked(&ctx->ctx_lock); - - /* TODO: use a hash or array, this sucks. */ - list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { - if (kiocb->ki_user_iocb == iocb) - return kiocb; - } - return NULL; -} - /* sys_io_cancel: * Attempts to cancel an iocb previously passed to io_submit. If * the operation is successfully cancelled, the resulting event is @@ -2032,6 +1990,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct aio_kiocb *kiocb; int ret = -EINVAL; u32 key; + u64 obj = (u64)(unsigned long)iocb; if (unlikely(get_user(key, &iocb->aio_key))) return -EFAULT; @@ -2043,10 +2002,13 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - kiocb = lookup_kiocb(ctx, iocb); - if (kiocb) { - ret = kiocb->ki_cancel(&kiocb->rw); - list_del_init(&kiocb->ki_list); + /* TODO: use a hash or array, this sucks. */ + list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { + if (kiocb->ki_res.obj == obj) { + ret = kiocb->ki_cancel(&kiocb->rw); + list_del_init(&kiocb->ki_list); + break; + } } spin_unlock_irq(&ctx->ctx_lock); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 4700b4534439..e273850c95af 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -44,7 +44,7 @@ static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int); static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); -static void befs_destroy_inode(struct inode *inode); +static void befs_free_inode(struct inode *inode); static void befs_destroy_inodecache(void); static int befs_symlink_readpage(struct file *, struct page *); static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, @@ -64,7 +64,7 @@ static struct dentry *befs_get_parent(struct dentry *child); static const struct super_operations befs_sops = { .alloc_inode = befs_alloc_inode, /* allocate a new inode */ - .destroy_inode = befs_destroy_inode, /* deallocate an inode */ + .free_inode = befs_free_inode, /* deallocate an inode */ .put_super = befs_put_super, /* uninit super */ .statfs = befs_statfs, /* statfs */ .remount_fs = befs_remount, @@ -281,17 +281,11 @@ befs_alloc_inode(struct super_block *sb) return &bi->vfs_inode; } -static void befs_i_callback(struct rcu_head *head) +static void befs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); } -static void befs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, befs_i_callback); -} - static void init_once(void *foo) { struct befs_inode_info *bi = (struct befs_inode_info *) foo; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index d136b2aaafb3..dc0cd2aa3d65 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -245,17 +245,11 @@ static struct inode *bfs_alloc_inode(struct super_block *sb) return &bi->vfs_inode; } -static void bfs_i_callback(struct rcu_head *head) +static void bfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); } -static void bfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, bfs_i_callback); -} - static void init_once(void *foo) { struct bfs_inode_info *bi = foo; @@ -287,7 +281,7 @@ static void destroy_inodecache(void) static const struct super_operations bfs_sops = { .alloc_inode = bfs_alloc_inode, - .destroy_inode = bfs_destroy_inode, + .free_inode = bfs_free_inode, .write_inode = bfs_write_inode, .evict_inode = bfs_evict_inode, .put_super = bfs_put_super, diff --git a/fs/block_dev.c b/fs/block_dev.c index 78d3257435c0..500aaa3e5990 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -210,7 +210,6 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, struct bio bio; ssize_t ret; blk_qc_t qc; - int i; struct bvec_iter_all iter_all; if ((pos | iov_iter_alignment(iter)) & @@ -261,10 +260,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, } __set_current_state(TASK_RUNNING); - bio_for_each_segment_all(bvec, &bio, i, iter_all) { + bio_for_each_segment_all(bvec, &bio, iter_all) { if (should_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); - put_page(bvec->bv_page); + if (!bio_flagged(&bio, BIO_NO_PAGE_REF)) + put_page(bvec->bv_page); } if (unlikely(bio.bi_status)) @@ -307,10 +307,10 @@ static void blkdev_bio_end_io(struct bio *bio) struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->should_dirty; - if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { - if (bio->bi_status && !dio->bio.bi_status) - dio->bio.bi_status = bio->bi_status; - } else { + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; + + if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; @@ -339,9 +339,8 @@ static void blkdev_bio_end_io(struct bio *bio) if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { struct bvec_iter_all iter_all; struct bio_vec *bvec; - int i; - bio_for_each_segment_all(bvec, bio, i, iter_all) + bio_for_each_segment_all(bvec, bio, iter_all) put_page(bvec->bv_page); } bio_put(bio); @@ -789,17 +788,9 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void bdev_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - struct bdev_inode *bdi = BDEV_I(inode); - - kmem_cache_free(bdev_cachep, bdi); -} - -static void bdev_destroy_inode(struct inode *inode) +static void bdev_free_inode(struct inode *inode) { - call_rcu(&inode->i_rcu, bdev_i_callback); + kmem_cache_free(bdev_cachep, BDEV_I(inode)); } static void init_once(void *foo) @@ -839,7 +830,7 @@ static void bdev_evict_inode(struct inode *inode) static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, - .destroy_inode = bdev_destroy_inode, + .free_inode = bdev_free_inode, .drop_inode = generic_delete_inode, .evict_inode = bdev_evict_inode, }; diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 5810463dc6d2..a0af1b952c4d 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -93,7 +93,11 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_setxattr(trans, inode, name, value, size, 0); + if (trans) + ret = btrfs_setxattr(trans, inode, name, value, size, 0); + else + ret = btrfs_setxattr_trans(inode, name, value, size, 0); + out: kfree(value); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 11459fe84a29..982152d3f920 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -791,7 +791,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, count = node->ref_mod * -1; break; default: - BUG_ON(1); + BUG(); } *total_refs += count; switch (node->type) { @@ -1460,8 +1460,8 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, * callers (such as fiemap) which want to know whether the extent is * shared but do not need a ref count. * - * This attempts to allocate a transaction in order to account for - * delayed refs, but continues on even when the alloc fails. + * This attempts to attach to the running transaction in order to account for + * delayed refs, but continues on even when no running transaction exists. * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. */ @@ -1484,13 +1484,16 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) tmp = ulist_alloc(GFP_NOFS); roots = ulist_alloc(GFP_NOFS); if (!tmp || !roots) { - ulist_free(tmp); - ulist_free(roots); - return -ENOMEM; + ret = -ENOMEM; + goto out; } - trans = btrfs_join_transaction(root); + trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) { + ret = PTR_ERR(trans); + goto out; + } trans = NULL; down_read(&fs_info->commit_root_sem); } else { @@ -1523,6 +1526,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) } else { up_read(&fs_info->commit_root_sem); } +out: ulist_free(tmp); ulist_free(roots); return ret; @@ -1747,7 +1751,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, else if (flags & BTRFS_EXTENT_FLAG_DATA) *flags_ret = BTRFS_EXTENT_FLAG_DATA; else - BUG_ON(1); + BUG(); return 0; } @@ -1912,13 +1916,19 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, extent_item_objectid); if (!search_commit_root) { - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + trans = btrfs_attach_transaction(fs_info->extent_root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT && + PTR_ERR(trans) != -EROFS) + return PTR_ERR(trans); + trans = NULL; + } + } + + if (trans) btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); - } else { + else down_read(&fs_info->commit_root_sem); - } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, tree_mod_seq_elem.seq, &refs, @@ -1951,7 +1961,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, free_leaf_list(refs); out: - if (!search_commit_root) { + if (trans) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); btrfs_end_transaction(trans); } else { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 6f5d07415dab..d5b438706b77 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -148,12 +148,6 @@ struct btrfs_inode { u64 last_unlink_trans; /* - * Track the transaction id of the last transaction used to create a - * hard link for the inode. This is used by the log tree (fsync). - */ - u64 last_link_trans; - - /* * Number of bytes outstanding that are going to need csums. This is * used in ENOSPC accounting. */ @@ -203,8 +197,6 @@ struct btrfs_inode { struct inode vfs_inode; }; -extern unsigned char btrfs_filetype_table[]; - static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 4f2a8ae0aa42..daf7908d1e35 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -160,7 +160,6 @@ csum_failed: if (cb->errors) { bio_io_error(cb->orig_bio); } else { - int i; struct bio_vec *bvec; struct bvec_iter_all iter_all; @@ -169,7 +168,7 @@ csum_failed: * checked so the end_io handlers know about it */ ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all) + bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) SetPageChecked(bvec->bv_page); bio_endio(cb->orig_bio); @@ -251,7 +250,7 @@ static void end_compressed_bio_write(struct bio *bio) cb->compressed_pages[0]->mapping = cb->inode->i_mapping; btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], cb->start, cb->start + cb->len - 1, - bio->bi_status ? BLK_STS_OK : BLK_STS_NOTSUPP); + bio->bi_status == BLK_STS_OK); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 324df36d28bf..5df76c17775a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -21,11 +21,9 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *ins_key, struct btrfs_path *path, int data_size, int extend); static int push_node_left(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, int empty); static int balance_node_right(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, @@ -726,11 +724,11 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) return __tree_mod_log_search(fs_info, start, min_seq, 0); } -static noinline int -tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, +static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) { + struct btrfs_fs_info *fs_info = dst->fs_info; int ret = 0; struct tree_mod_elem **tm_list = NULL; struct tree_mod_elem **tm_list_add, **tm_list_rem; @@ -950,7 +948,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (new_flags != 0) { int level = btrfs_header_level(buf); - ret = btrfs_set_disk_extent_flags(trans, fs_info, + ret = btrfs_set_disk_extent_flags(trans, buf->start, buf->len, new_flags, level, 0); @@ -970,7 +968,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (ret) return ret; } - clean_tree_block(fs_info, buf); + btrfs_clean_tree_block(buf); *last_ref = 1; } return 0; @@ -1792,9 +1790,8 @@ static void root_sub_used(struct btrfs_root *root, u32 size) /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). */ -static noinline struct extent_buffer * -read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent, - int slot) +static noinline struct extent_buffer *read_node_slot( + struct extent_buffer *parent, int slot) { int level = btrfs_header_level(parent); struct extent_buffer *eb; @@ -1806,7 +1803,7 @@ read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent, BUG_ON(level == 0); btrfs_node_key_to_cpu(parent, &first_key, slot); - eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot), + eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot), level - 1, &first_key); if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { @@ -1863,7 +1860,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; /* promote the child to a root */ - child = read_node_slot(fs_info, mid, 0); + child = read_node_slot(mid, 0); if (IS_ERR(child)) { ret = PTR_ERR(child); btrfs_handle_fs_error(fs_info, ret, NULL); @@ -1888,7 +1885,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, path->locks[level] = 0; path->nodes[level] = NULL; - clean_tree_block(fs_info, mid); + btrfs_clean_tree_block(mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1903,7 +1900,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) return 0; - left = read_node_slot(fs_info, parent, pslot - 1); + left = read_node_slot(parent, pslot - 1); if (IS_ERR(left)) left = NULL; @@ -1918,7 +1915,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } } - right = read_node_slot(fs_info, parent, pslot + 1); + right = read_node_slot(parent, pslot + 1); if (IS_ERR(right)) right = NULL; @@ -1936,7 +1933,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* first, try to make some room in the middle buffer */ if (left) { orig_slot += btrfs_header_nritems(left); - wret = push_node_left(trans, fs_info, left, mid, 1); + wret = push_node_left(trans, left, mid, 1); if (wret < 0) ret = wret; } @@ -1945,11 +1942,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * then try to empty the right most buffer into the middle */ if (right) { - wret = push_node_left(trans, fs_info, mid, right, 1); + wret = push_node_left(trans, mid, right, 1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - clean_tree_block(fs_info, right); + btrfs_clean_tree_block(right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -1981,20 +1978,20 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_handle_fs_error(fs_info, ret, NULL); goto enospc; } - wret = balance_node_right(trans, fs_info, mid, left); + wret = balance_node_right(trans, mid, left); if (wret < 0) { ret = wret; goto enospc; } if (wret == 1) { - wret = push_node_left(trans, fs_info, left, mid, 1); + wret = push_node_left(trans, left, mid, 1); if (wret < 0) ret = wret; } BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - clean_tree_block(fs_info, mid); + btrfs_clean_tree_block(mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -2078,7 +2075,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (!parent) return 1; - left = read_node_slot(fs_info, parent, pslot - 1); + left = read_node_slot(parent, pslot - 1); if (IS_ERR(left)) left = NULL; @@ -2098,8 +2095,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (ret) wret = 1; else { - wret = push_node_left(trans, fs_info, - left, mid, 0); + wret = push_node_left(trans, left, mid, 0); } } if (wret < 0) @@ -2131,7 +2127,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_tree_unlock(left); free_extent_buffer(left); } - right = read_node_slot(fs_info, parent, pslot + 1); + right = read_node_slot(parent, pslot + 1); if (IS_ERR(right)) right = NULL; @@ -2154,8 +2150,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (ret) wret = 1; else { - wret = balance_node_right(trans, fs_info, - right, mid); + wret = balance_node_right(trans, right, mid); } } if (wret < 0) @@ -2416,6 +2411,16 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (tmp) { /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + /* + * Do extra check for first_key, eb can be stale due to + * being cached, read from scrub, or have multiple + * parents (shared tree blocks). + */ + if (btrfs_verify_level_key(tmp, + parent_level - 1, &first_key, gen)) { + free_extent_buffer(tmp); + return -EUCLEAN; + } *eb_ret = tmp; return 0; } @@ -2706,7 +2711,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; int slot; int ret; @@ -2904,7 +2908,7 @@ cow_done: } else { p->slots[level] = slot; if (ins_len > 0 && - btrfs_leaf_free_space(fs_info, b) < ins_len) { + btrfs_leaf_free_space(b) < ins_len) { if (write_lock_level < 1) { write_lock_level = 1; btrfs_release_path(p); @@ -3181,11 +3185,31 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, slot = path->slots[0]; if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); - BUG_ON(comp_keys(&disk_key, new_key) >= 0); + if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { + btrfs_crit(fs_info, + "slot %u key (%llu %u %llu) new key (%llu %u %llu)", + slot, btrfs_disk_key_objectid(&disk_key), + btrfs_disk_key_type(&disk_key), + btrfs_disk_key_offset(&disk_key), + new_key->objectid, new_key->type, + new_key->offset); + btrfs_print_leaf(eb); + BUG(); + } } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); - BUG_ON(comp_keys(&disk_key, new_key) <= 0); + if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { + btrfs_crit(fs_info, + "slot %u key (%llu %u %llu) new key (%llu %u %llu)", + slot, btrfs_disk_key_objectid(&disk_key), + btrfs_disk_key_type(&disk_key), + btrfs_disk_key_offset(&disk_key), + new_key->objectid, new_key->type, + new_key->offset); + btrfs_print_leaf(eb); + BUG(); + } } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -3203,10 +3227,10 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, * error, and > 0 if there was no room in the left hand block. */ static int push_node_left(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, int empty) { + struct btrfs_fs_info *fs_info = trans->fs_info; int push_items = 0; int src_nritems; int dst_nritems; @@ -3239,8 +3263,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); - ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0, - push_items); + ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3278,10 +3301,10 @@ static int push_node_left(struct btrfs_trans_handle *trans, * this will only push up to 1/2 the contents of the left node over */ static int balance_node_right(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src) { + struct btrfs_fs_info *fs_info = trans->fs_info; int push_items = 0; int max_push; int src_nritems; @@ -3315,8 +3338,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, (dst_nritems) * sizeof(struct btrfs_key_ptr)); - ret = tree_mod_log_eb_copy(fs_info, dst, src, 0, - src_nritems - push_items, push_items); + ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, + push_items); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3404,7 +3427,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * blocknr is the block the key points to. */ static void insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct btrfs_path *path, struct btrfs_disk_key *key, u64 bytenr, int slot, int level) { @@ -3417,7 +3440,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans, lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); - BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info)); + BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); if (slot != nritems) { if (level) { ret = tree_mod_log_insert_move(lower, slot + 1, slot, @@ -3501,7 +3524,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, root_add_used(root, fs_info->nodesize); ASSERT(btrfs_header_level(c) == level); - ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid); + ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3517,7 +3540,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - insert_ptr(trans, fs_info, path, &disk_key, split->start, + insert_ptr(trans, path, &disk_key, split->start, path->slots[level + 1] + 1, level + 1); if (path->slots[level] >= mid) { @@ -3565,9 +3588,9 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) * the start of the leaf data. IOW, how much room * the leaf has left for both items and data */ -noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf) +noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) { + struct btrfs_fs_info *fs_info = leaf->fs_info; int nritems = btrfs_header_nritems(leaf); int ret; @@ -3586,13 +3609,13 @@ noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info, * min slot controls the lowest index we're willing to push to the * right. We'll push up to and including min_slot, but no lower */ -static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, +static noinline int __push_leaf_right(struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, int free_space, u32 left_nritems, u32 min_slot) { + struct btrfs_fs_info *fs_info = right->fs_info; struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; struct btrfs_map_token token; @@ -3626,7 +3649,8 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, if (path->slots[0] > i) break; if (path->slots[0] == i) { - int space = btrfs_leaf_free_space(fs_info, left); + int space = btrfs_leaf_free_space(left); + if (space + push_space * 2 > free_space) break; } @@ -3655,10 +3679,10 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, right_nritems = btrfs_header_nritems(right); push_space = btrfs_item_end_nr(left, left_nritems - push_items); - push_space -= leaf_data_end(fs_info, left); + push_space -= leaf_data_end(left); /* make room in the right data area */ - data_end = leaf_data_end(fs_info, right); + data_end = leaf_data_end(right); memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + data_end - push_space, BTRFS_LEAF_DATA_OFFSET + data_end, @@ -3667,7 +3691,7 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, /* copy from the left data area */ copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left), + BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), @@ -3695,7 +3719,7 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, if (left_nritems) btrfs_mark_buffer_dirty(left); else - clean_tree_block(fs_info, left); + btrfs_clean_tree_block(left); btrfs_mark_buffer_dirty(right); @@ -3707,7 +3731,7 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info, if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(fs_info, path->nodes[0]); + btrfs_clean_tree_block(path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3739,7 +3763,6 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root int min_data_size, int data_size, int empty, u32 min_slot) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *left = path->nodes[0]; struct extent_buffer *right; struct extent_buffer *upper; @@ -3758,7 +3781,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); - right = read_node_slot(fs_info, upper, slot + 1); + right = read_node_slot(upper, slot + 1); /* * slot + 1 is not valid or we fail to read the right node, * no big deal, just return. @@ -3769,7 +3792,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_tree_lock(right); btrfs_set_lock_blocking_write(right); - free_space = btrfs_leaf_free_space(fs_info, right); + free_space = btrfs_leaf_free_space(right); if (free_space < data_size) goto out_unlock; @@ -3779,7 +3802,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (ret) goto out_unlock; - free_space = btrfs_leaf_free_space(fs_info, right); + free_space = btrfs_leaf_free_space(right); if (free_space < data_size) goto out_unlock; @@ -3800,7 +3823,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 0; } - return __push_leaf_right(fs_info, path, min_data_size, empty, + return __push_leaf_right(path, min_data_size, empty, right, free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); @@ -3816,12 +3839,12 @@ out_unlock: * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the * items */ -static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, int data_size, +static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, int free_space, u32 right_nritems, u32 max_slot) { + struct btrfs_fs_info *fs_info = left->fs_info; struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; int i; @@ -3849,7 +3872,8 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, if (path->slots[0] < i) break; if (path->slots[0] == i) { - int space = btrfs_leaf_free_space(fs_info, right); + int space = btrfs_leaf_free_space(right); + if (space + push_space * 2 > free_space) break; } @@ -3882,7 +3906,7 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, btrfs_item_offset_nr(right, push_items - 1); copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(fs_info, left) - push_space, + leaf_data_end(left) - push_space, BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset_nr(right, push_items - 1), push_space); @@ -3909,11 +3933,11 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, if (push_items < right_nritems) { push_space = btrfs_item_offset_nr(right, push_items - 1) - - leaf_data_end(fs_info, right); + leaf_data_end(right); memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(fs_info, right), push_space); + leaf_data_end(right), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(0), btrfs_item_nr_offset(push_items), @@ -3935,7 +3959,7 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info, if (right_nritems) btrfs_mark_buffer_dirty(right); else - clean_tree_block(fs_info, right); + btrfs_clean_tree_block(right); btrfs_item_key(right, &disk_key, 0); fixup_low_keys(path, &disk_key, 1); @@ -3972,7 +3996,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int min_data_size, int data_size, int empty, u32 max_slot) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *right = path->nodes[0]; struct extent_buffer *left; int slot; @@ -3992,7 +4015,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); - left = read_node_slot(fs_info, path->nodes[1], slot - 1); + left = read_node_slot(path->nodes[1], slot - 1); /* * slot - 1 is not valid or we fail to read the left node, * no big deal, just return. @@ -4003,7 +4026,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_tree_lock(left); btrfs_set_lock_blocking_write(left); - free_space = btrfs_leaf_free_space(fs_info, left); + free_space = btrfs_leaf_free_space(left); if (free_space < data_size) { ret = 1; goto out; @@ -4019,13 +4042,13 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - free_space = btrfs_leaf_free_space(fs_info, left); + free_space = btrfs_leaf_free_space(left); if (free_space < data_size) { ret = 1; goto out; } - return __push_leaf_left(fs_info, path, min_data_size, + return __push_leaf_left(path, min_data_size, empty, left, free_space, right_nritems, max_slot); out: @@ -4039,12 +4062,12 @@ out: * available for the resulting leaf level of the path. */ static noinline void copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct extent_buffer *l, struct extent_buffer *right, int slot, int mid, int nritems) { + struct btrfs_fs_info *fs_info = trans->fs_info; int data_copy_size; int rt_data_off; int i; @@ -4055,7 +4078,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); - data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l); copy_extent_buffer(right, l, btrfs_item_nr_offset(0), btrfs_item_nr_offset(mid), @@ -4064,7 +4087,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, copy_extent_buffer(right, l, BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(fs_info, l), data_copy_size); + leaf_data_end(l), data_copy_size); rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); @@ -4079,8 +4102,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(l, mid); btrfs_item_key(right, &disk_key, 0); - insert_ptr(trans, fs_info, path, &disk_key, right->start, - path->slots[1] + 1, 1); + insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -4115,7 +4137,6 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, struct btrfs_path *path, int data_size) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; int progress = 0; int slot; @@ -4124,7 +4145,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, slot = path->slots[0]; if (slot < btrfs_header_nritems(path->nodes[0])) - space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]); + space_needed -= btrfs_leaf_free_space(path->nodes[0]); /* * try to push all the items after our slot into the @@ -4145,14 +4166,14 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, if (path->slots[0] == 0 || path->slots[0] == nritems) return 0; - if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size) + if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) return 0; /* try to push all the items before our slot into the next leaf */ slot = path->slots[0]; space_needed = data_size; if (slot > 0) - space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]); + space_needed -= btrfs_leaf_free_space(path->nodes[0]); ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot); if (ret < 0) return ret; @@ -4201,7 +4222,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int space_needed = data_size; if (slot < btrfs_header_nritems(l)) - space_needed -= btrfs_leaf_free_space(fs_info, l); + space_needed -= btrfs_leaf_free_space(l); wret = push_leaf_right(trans, root, path, space_needed, space_needed, 0, 0); @@ -4210,8 +4231,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, if (wret) { space_needed = data_size; if (slot > 0) - space_needed -= btrfs_leaf_free_space(fs_info, - l); + space_needed -= btrfs_leaf_free_space(l); wret = push_leaf_left(trans, root, path, space_needed, space_needed, 0, (u32)-1); if (wret < 0) @@ -4220,7 +4240,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, l = path->nodes[0]; /* did the pushes work? */ - if (btrfs_leaf_free_space(fs_info, l) >= data_size) + if (btrfs_leaf_free_space(l) >= data_size) return 0; } @@ -4288,7 +4308,7 @@ again: if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, fs_info, path, &disk_key, + insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); @@ -4297,7 +4317,7 @@ again: path->slots[1] += 1; } else { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, fs_info, path, &disk_key, + insert_ptr(trans, path, &disk_key, right->start, path->slots[1], 1); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); @@ -4314,7 +4334,7 @@ again: return ret; } - copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems); + copy_for_split(trans, path, l, right, slot, mid, nritems); if (split == 2) { BUG_ON(num_doubles != 0); @@ -4327,7 +4347,7 @@ again: push_for_double: push_for_double_split(trans, root, path, data_size); tried_avoid_double = 1; - if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size) + if (btrfs_leaf_free_space(path->nodes[0]) >= data_size) return 0; goto again; } @@ -4336,7 +4356,6 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int ins_len) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -4350,7 +4369,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); - if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len) + if (btrfs_leaf_free_space(leaf) >= ins_len) return 0; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -4377,7 +4396,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, goto err; /* the leaf has changed, it now has room. return now */ - if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len) + if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len) goto err; if (key.type == BTRFS_EXTENT_DATA_KEY) { @@ -4400,8 +4419,7 @@ err: return ret; } -static noinline int split_item(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, +static noinline int split_item(struct btrfs_path *path, const struct btrfs_key *new_key, unsigned long split_offset) { @@ -4416,7 +4434,7 @@ static noinline int split_item(struct btrfs_fs_info *fs_info, struct btrfs_disk_key disk_key; leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item)); + BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); btrfs_set_path_blocking(path); @@ -4465,7 +4483,7 @@ static noinline int split_item(struct btrfs_fs_info *fs_info, item_size - split_offset); btrfs_mark_buffer_dirty(leaf); - BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0); + BUG_ON(btrfs_leaf_free_space(leaf) < 0); kfree(buf); return 0; } @@ -4497,7 +4515,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = split_item(root->fs_info, path, new_key, split_offset); + ret = split_item(path, new_key, split_offset); return ret; } @@ -4543,8 +4561,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * off the end of the item or if we shift the item to chop bytes off * the front. */ -void btrfs_truncate_item(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u32 new_size, int from_end) +void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -4567,7 +4584,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, return; nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(fs_info, leaf); + data_end = leaf_data_end(leaf); old_data_start = btrfs_item_offset_nr(leaf, slot); @@ -4633,7 +4650,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, btrfs_set_item_size(leaf, item, new_size); btrfs_mark_buffer_dirty(leaf); - if (btrfs_leaf_free_space(fs_info, leaf) < 0) { + if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); BUG(); } @@ -4642,8 +4659,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, /* * make the item pointed to by the path bigger, data_size is the added size. */ -void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - u32 data_size) +void btrfs_extend_item(struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; @@ -4660,9 +4676,9 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(fs_info, leaf); + data_end = leaf_data_end(leaf); - if (btrfs_leaf_free_space(fs_info, leaf) < data_size) { + if (btrfs_leaf_free_space(leaf) < data_size) { btrfs_print_leaf(leaf); BUG(); } @@ -4672,9 +4688,9 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, BUG_ON(slot < 0); if (slot >= nritems) { btrfs_print_leaf(leaf); - btrfs_crit(fs_info, "slot %d too large, nritems %d", + btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d", slot, nritems); - BUG_ON(1); + BUG(); } /* @@ -4701,7 +4717,7 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, btrfs_set_item_size(leaf, item, old_size + data_size); btrfs_mark_buffer_dirty(leaf); - if (btrfs_leaf_free_space(fs_info, leaf) < 0) { + if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); BUG(); } @@ -4738,12 +4754,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, slot = path->slots[0]; nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(fs_info, leaf); + data_end = leaf_data_end(leaf); - if (btrfs_leaf_free_space(fs_info, leaf) < total_size) { + if (btrfs_leaf_free_space(leaf) < total_size) { btrfs_print_leaf(leaf); btrfs_crit(fs_info, "not enough freespace need %u have %d", - total_size, btrfs_leaf_free_space(fs_info, leaf)); + total_size, btrfs_leaf_free_space(leaf)); BUG(); } @@ -4754,7 +4770,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_print_leaf(leaf); btrfs_crit(fs_info, "slot %d old_data %d data_end %d", slot, old_data, data_end); - BUG_ON(1); + BUG(); } /* * item0..itemN ... dataN.offset..dataN.size .. data0.size @@ -4794,7 +4810,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_header_nritems(leaf, nritems + nr); btrfs_mark_buffer_dirty(leaf); - if (btrfs_leaf_free_space(fs_info, leaf) < 0) { + if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); BUG(); } @@ -4966,7 +4982,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, nritems = btrfs_header_nritems(leaf); if (slot + nr != nritems) { - int data_end = leaf_data_end(fs_info, leaf); + int data_end = leaf_data_end(leaf); memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + data_end + dsize, @@ -4996,7 +5012,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_level(leaf, 0); } else { btrfs_set_path_blocking(path); - clean_tree_block(fs_info, leaf); + btrfs_clean_tree_block(leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { @@ -5126,7 +5142,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_path *path, u64 min_trans) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *cur; struct btrfs_key found_key; int slot; @@ -5207,7 +5222,7 @@ find_next_key: goto out; } btrfs_set_path_blocking(path); - cur = read_node_slot(fs_info, cur, slot); + cur = read_node_slot(cur, slot); if (IS_ERR(cur)) { ret = PTR_ERR(cur); goto out; @@ -5229,14 +5244,12 @@ out: return ret; } -static int tree_move_down(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - int *level) +static int tree_move_down(struct btrfs_path *path, int *level) { struct extent_buffer *eb; BUG_ON(*level == 0); - eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]); + eb = read_node_slot(path->nodes[*level], path->slots[*level]); if (IS_ERR(eb)) return PTR_ERR(eb); @@ -5276,8 +5289,7 @@ static int tree_move_next_or_upnext(struct btrfs_path *path, * Returns 1 if it had to move up and next. 0 is returned if it moved only next * or down. */ -static int tree_advance(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, +static int tree_advance(struct btrfs_path *path, int *level, int root_level, int allow_down, struct btrfs_key *key) @@ -5287,7 +5299,7 @@ static int tree_advance(struct btrfs_fs_info *fs_info, if (*level == 0 || !allow_down) { ret = tree_move_next_or_upnext(path, level, root_level); } else { - ret = tree_move_down(fs_info, path, level); + ret = tree_move_down(path, level); } if (ret >= 0) { if (*level == 0) @@ -5464,7 +5476,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, while (1) { if (advance_left && !left_end_reached) { - ret = tree_advance(fs_info, left_path, &left_level, + ret = tree_advance(left_path, &left_level, left_root_level, advance_left != ADVANCE_ONLY_NEXT, &left_key); @@ -5475,7 +5487,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, advance_left = 0; } if (advance_right && !right_end_reached) { - ret = tree_advance(fs_info, right_path, &right_level, + ret = tree_advance(right_path, &right_level, right_root_level, advance_right != ADVANCE_ONLY_NEXT, &right_key); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b3642367a595..0a61dff27f57 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,6 +41,7 @@ extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; struct btrfs_ordered_sum; +struct btrfs_ref; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ @@ -1015,6 +1016,7 @@ struct btrfs_fs_info { /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; + struct percpu_counter dio_bytes; s32 dirty_metadata_batch; s32 delalloc_batch; @@ -1092,10 +1094,7 @@ struct btrfs_fs_info { /* holds configuration and tracking. Protected by qgroup_lock */ struct rb_root qgroup_tree; - struct rb_root qgroup_op_tree; spinlock_t qgroup_lock; - spinlock_t qgroup_op_lock; - atomic_t qgroup_op_seq; /* * used to avoid frequently calling ulist_alloc()/ulist_free() @@ -1152,12 +1151,6 @@ struct btrfs_fs_info { struct mutex unused_bg_unpin_mutex; struct mutex delete_unused_bgs_mutex; - /* - * Chunks that can't be freed yet (under a trim/discard operation) - * and will be latter freed. Protected by fs_info->chunk_mutex. - */ - struct list_head pinned_chunks; - /* Cached block sizes */ u32 nodesize; u32 sectorsize; @@ -1348,6 +1341,12 @@ struct btrfs_root { * manipulation with the read-only status via SUBVOL_SETFLAGS */ int send_in_progress; + /* + * Number of currently running deduplication operations that have a + * destination inode belonging to this root. Protected by the lock + * root_item_lock. + */ + int dedupe_in_progress; struct btrfs_subvolume_writers *subv_writers; atomic_t will_be_snapshotted; atomic_t snapshot_force_cow; @@ -1540,6 +1539,21 @@ do { \ #define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) +#define BTRFS_INODE_FLAG_MASK \ + (BTRFS_INODE_NODATASUM | \ + BTRFS_INODE_NODATACOW | \ + BTRFS_INODE_READONLY | \ + BTRFS_INODE_NOCOMPRESS | \ + BTRFS_INODE_PREALLOC | \ + BTRFS_INODE_SYNC | \ + BTRFS_INODE_IMMUTABLE | \ + BTRFS_INODE_APPEND | \ + BTRFS_INODE_NODUMP | \ + BTRFS_INODE_NOATIME | \ + BTRFS_INODE_DIRSYNC | \ + BTRFS_INODE_COMPRESS | \ + BTRFS_INODE_ROOT_ITEM_INIT) + struct btrfs_map_token { const struct extent_buffer *eb; char *kaddr; @@ -2163,18 +2177,16 @@ static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) return (btrfs_header_flags(eb) & flag) == flag; } -static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) { u64 flags = btrfs_header_flags(eb); btrfs_set_header_flags(eb, flags | flag); - return (flags & flag) == flag; } -static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) { u64 flags = btrfs_header_flags(eb); btrfs_set_header_flags(eb, flags & ~flag); - return (flags & flag) == flag; } static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) @@ -2445,13 +2457,12 @@ static inline int btrfs_super_csum_size(const struct btrfs_super_block *s) * this returns the address of the start of the last item, * which is the stop of the leaf data stack */ -static inline unsigned int leaf_data_end(const struct btrfs_fs_info *fs_info, - const struct extent_buffer *leaf) +static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) { u32 nr = btrfs_header_nritems(leaf); if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(fs_info); + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); return btrfs_item_offset_nr(leaf, nr - 1); } @@ -2698,8 +2709,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); -int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, - unsigned long count, u64 transid, int wait); void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); @@ -2711,8 +2720,7 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes); -int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb); +int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); struct btrfs_block_group_cache *btrfs_lookup_block_group( @@ -2745,13 +2753,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 flags, int level, int is_data); -int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset); +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); @@ -2760,15 +2764,11 @@ int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset); + struct btrfs_ref *generic_ref); int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); -int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans); +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_fs_info *info); @@ -2936,10 +2936,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); -void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - u32 data_size); -void btrfs_truncate_item(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u32 new_size, int from_end); +void btrfs_extend_item(struct btrfs_path *path, u32 data_size); +void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -3015,8 +3013,7 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); } -int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf); +int btrfs_leaf_free_space(struct extent_buffer *leaf); int __must_check btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int update_ref, int for_reloc); @@ -3267,6 +3264,7 @@ void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); +void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); @@ -3755,8 +3753,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); int btrfs_scrub_cancel(struct btrfs_fs_info *info); -int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, - struct btrfs_device *dev); +int btrfs_scrub_cancel_dev(struct btrfs_device *dev); int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress); static inline void btrfs_init_full_stripe_locks_tree( @@ -3805,6 +3802,8 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) return signal_pending(current); } +#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) + /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_inode_set_ops(struct inode *inode); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c669f250d4a0..43fdb2992956 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -691,7 +691,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_delayed_item *item) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; int free_space; int total_data_size = 0, total_size = 0; @@ -708,7 +707,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, BUG_ON(!path->nodes[0]); leaf = path->nodes[0]; - free_space = btrfs_leaf_free_space(fs_info, leaf); + free_space = btrfs_leaf_free_space(leaf); INIT_LIST_HEAD(&head); next = item; @@ -1692,7 +1691,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, name = (char *)(di + 1); name_len = btrfs_stack_dir_name_len(di); - d_type = btrfs_filetype_table[di->type]; + d_type = fs_ftype_to_dtype(di->type); btrfs_disk_key_to_cpu(&location, &di->location); over = !dir_emit(ctx, name, name_len, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 7d2a413df90d..a73fc23e2961 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -735,8 +735,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, * transaction commits. */ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action, + struct btrfs_ref *generic_ref, struct btrfs_delayed_extent_op *extent_op, int *old_ref_mod, int *new_ref_mod) { @@ -746,10 +745,18 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; int qrecord_inserted; - bool is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID); + bool is_system; + int action = generic_ref->action; + int level = generic_ref->tree_ref.level; int ret; + u64 bytenr = generic_ref->bytenr; + u64 num_bytes = generic_ref->len; + u64 parent = generic_ref->parent; u8 ref_type; + is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID); + + ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) @@ -762,7 +769,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, } if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - is_fstree(ref_root)) { + is_fstree(generic_ref->real_root) && + is_fstree(generic_ref->tree_ref.root) && + !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); @@ -777,13 +786,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, ref_type = BTRFS_TREE_BLOCK_REF_KEY; init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - ref_root, action, ref_type); - ref->root = ref_root; + generic_ref->tree_ref.root, action, ref_type); + ref->root = generic_ref->tree_ref.root; ref->parent = parent; ref->level = level; init_delayed_ref_head(head_ref, record, bytenr, num_bytes, - ref_root, 0, action, false, is_system); + generic_ref->tree_ref.root, 0, action, false, + is_system); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -822,10 +832,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. */ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, - u64 parent, u64 ref_root, - u64 owner, u64 offset, u64 reserved, int action, - int *old_ref_mod, int *new_ref_mod) + struct btrfs_ref *generic_ref, + u64 reserved, int *old_ref_mod, + int *new_ref_mod) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_data_ref *ref; @@ -833,9 +842,17 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; int qrecord_inserted; + int action = generic_ref->action; int ret; + u64 bytenr = generic_ref->bytenr; + u64 num_bytes = generic_ref->len; + u64 parent = generic_ref->parent; + u64 ref_root = generic_ref->data_ref.ref_root; + u64 owner = generic_ref->data_ref.ino; + u64 offset = generic_ref->data_ref.offset; u8 ref_type; + ASSERT(generic_ref->type == BTRFS_REF_DATA && action); ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; @@ -859,7 +876,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, } if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - is_fstree(ref_root)) { + is_fstree(ref_root) && + is_fstree(generic_ref->real_root) && + !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); @@ -905,8 +924,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, return 0; } -int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op) { diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 70606da440aa..c18f93ea88ed 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -176,6 +176,83 @@ struct btrfs_delayed_ref_root { u64 qgroup_to_skip; }; +enum btrfs_ref_type { + BTRFS_REF_NOT_SET, + BTRFS_REF_DATA, + BTRFS_REF_METADATA, + BTRFS_REF_LAST, +}; + +struct btrfs_data_ref { + /* For EXTENT_DATA_REF */ + + /* Root which refers to this data extent */ + u64 ref_root; + + /* Inode which refers to this data extent */ + u64 ino; + + /* + * file_offset - extent_offset + * + * file_offset is the key.offset of the EXTENT_DATA key. + * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data. + */ + u64 offset; +}; + +struct btrfs_tree_ref { + /* + * Level of this tree block + * + * Shared for skinny (TREE_BLOCK_REF) and normal tree ref. + */ + int level; + + /* + * Root which refers to this tree block. + * + * For TREE_BLOCK_REF (skinny metadata, either inline or keyed) + */ + u64 root; + + /* For non-skinny metadata, no special member needed */ +}; + +struct btrfs_ref { + enum btrfs_ref_type type; + int action; + + /* + * Whether this extent should go through qgroup record. + * + * Normally false, but for certain cases like delayed subtree scan, + * setting this flag can hugely reduce qgroup overhead. + */ + bool skip_qgroup; + + /* + * Optional. For which root is this modification. + * Mostly used for qgroup optimization. + * + * When unset, data/tree ref init code will populate it. + * In certain cases, we're modifying reference for a different root. + * E.g. COW fs tree blocks for balance. + * In that case, tree_ref::root will be fs tree, but we're doing this + * for reloc tree, then we should set @real_root to reloc tree. + */ + u64 real_root; + u64 bytenr; + u64 len; + + /* Bytenr of the parent tree block */ + u64 parent; + union { + struct btrfs_data_ref data_ref; + struct btrfs_tree_ref tree_ref; + }; +}; + extern struct kmem_cache *btrfs_delayed_ref_head_cachep; extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; extern struct kmem_cache *btrfs_delayed_data_ref_cachep; @@ -184,6 +261,38 @@ extern struct kmem_cache *btrfs_delayed_extent_op_cachep; int __init btrfs_delayed_ref_init(void); void __cold btrfs_delayed_ref_exit(void); +static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, + int action, u64 bytenr, u64 len, u64 parent) +{ + generic_ref->action = action; + generic_ref->bytenr = bytenr; + generic_ref->len = len; + generic_ref->parent = parent; +} + +static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, + int level, u64 root) +{ + /* If @real_root not set, use @root as fallback */ + if (!generic_ref->real_root) + generic_ref->real_root = root; + generic_ref->tree_ref.level = level; + generic_ref->tree_ref.root = root; + generic_ref->type = BTRFS_REF_METADATA; +} + +static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref, + u64 ref_root, u64 ino, u64 offset) +{ + /* If @real_root not set, use @root as fallback */ + if (!generic_ref->real_root) + generic_ref->real_root = ref_root; + generic_ref->data_ref.ref_root = ref_root; + generic_ref->data_ref.ino = ino; + generic_ref->data_ref.offset = offset; + generic_ref->type = BTRFS_REF_DATA; +} + static inline struct btrfs_delayed_extent_op * btrfs_alloc_delayed_extent_op(void) { @@ -224,17 +333,14 @@ static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *hea } int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action, + struct btrfs_ref *generic_ref, struct btrfs_delayed_extent_op *extent_op, int *old_ref_mod, int *new_ref_mod); int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, - u64 parent, u64 ref_root, - u64 owner, u64 offset, u64 reserved, int action, - int *old_ref_mod, int *new_ref_mod); -int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, + struct btrfs_ref *generic_ref, + u64 reserved, int *old_ref_mod, + int *new_ref_mod); +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ee193c5222b2..55c15f31d00d 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -273,9 +273,9 @@ error: * called from commit_transaction. Writes changed device replace state to * disk. */ -int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_path *path; @@ -662,7 +662,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_device_set_disk_total_bytes(tgt_device, src_device->disk_total_bytes); btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); - ASSERT(list_empty(&src_device->resized_list)); + ASSERT(list_empty(&src_device->post_commit_list)); tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->commit_bytes_used = src_device->bytes_used; @@ -696,7 +696,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* replace the sysfs entry */ btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); - btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); + btrfs_rm_dev_replace_free_srcdev(src_device); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 4aa40bacc6cc..78c5d8f1adda 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -9,8 +9,7 @@ struct btrfs_ioctl_dev_replace_args; int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); -int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans); int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 8de74d835dba..863367c2c620 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -36,7 +36,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle di = btrfs_match_dir_item_name(fs_info, path, name, name_len); if (di) return ERR_PTR(-EEXIST); - btrfs_extend_item(fs_info, path, data_size); + btrfs_extend_item(path, data_size); } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); @@ -429,8 +429,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); - btrfs_truncate_item(root->fs_info, path, - item_len - sub_item_len, 1); + btrfs_truncate_item(path, item_len - sub_item_len, 1); } return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6fe9197f6ee4..deb74a8c191a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -260,15 +260,12 @@ void btrfs_csum_final(u32 crc, u8 *result) } /* - * compute the csum for a btree block, and either verify it or write it - * into the csum field of the block. + * Compute the csum of a btree block and store the result to provided buffer. + * + * Returns error if the extent buffer cannot be mapped. */ -static int csum_tree_block(struct btrfs_fs_info *fs_info, - struct extent_buffer *buf, - int verify) +static int csum_tree_block(struct extent_buffer *buf, u8 *result) { - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - char result[BTRFS_CSUM_SIZE]; unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; @@ -288,7 +285,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, */ err = map_private_extent_buffer(buf, offset, 32, &kaddr, &map_start, &map_len); - if (err) + if (WARN_ON(err)) return err; cur_len = min(len, map_len - (offset - map_start)); crc = btrfs_csum_data(kaddr + offset - map_start, @@ -300,23 +297,6 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, btrfs_csum_final(crc, result); - if (verify) { - if (memcmp_extent_buffer(buf, result, 0, csum_size)) { - u32 val; - u32 found = 0; - memcpy(&found, result, csum_size); - - read_extent_buffer(buf, &val, 0, csum_size); - btrfs_warn_rl(fs_info, - "%s checksum verify failed on %llu wanted %X found %X level %d", - fs_info->sb->s_id, buf->start, - val, found, btrfs_header_level(buf)); - return -EUCLEAN; - } - } else { - write_extent_buffer(buf, result, 0, csum_size); - } - return 0; } @@ -414,22 +394,21 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, return ret; } -static int verify_level_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid) { + struct btrfs_fs_info *fs_info = eb->fs_info; int found_level; struct btrfs_key found_key; int ret; found_level = btrfs_header_level(eb); if (found_level != level) { -#ifdef CONFIG_BTRFS_DEBUG - WARN_ON(1); + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree level check failed\n"); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", eb->start, level, found_level); -#endif return -EIO; } @@ -450,9 +429,9 @@ static int verify_level_key(struct btrfs_fs_info *fs_info, btrfs_item_key_to_cpu(eb, &found_key, 0); ret = btrfs_comp_cpu_keys(first_key, &found_key); -#ifdef CONFIG_BTRFS_DEBUG if (ret) { - WARN_ON(1); + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree first key check failed\n"); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", eb->start, parent_transid, first_key->objectid, @@ -460,7 +439,6 @@ static int verify_level_key(struct btrfs_fs_info *fs_info, found_key.objectid, found_key.type, found_key.offset); } -#endif return ret; } @@ -472,11 +450,11 @@ static int verify_level_key(struct btrfs_fs_info *fs_info, * @level: expected level, mandatory check * @first_key: expected key of first slot, skip check if NULL */ -static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, +static int btree_read_extent_buffer_pages(struct extent_buffer *eb, u64 parent_transid, int level, struct btrfs_key *first_key) { + struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; int failed = 0; int ret; @@ -487,14 +465,13 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, - mirror_num); + ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num); if (!ret) { if (verify_parent_transid(io_tree, eb, parent_transid, 0)) ret = -EIO; - else if (verify_level_key(fs_info, eb, level, - first_key, parent_transid)) + else if (btrfs_verify_level_key(eb, level, + first_key, parent_transid)) ret = -EUCLEAN; else break; @@ -519,7 +496,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, } if (failed && !ret && failed_mirror) - repair_eb_io_failure(fs_info, eb, failed_mirror); + btrfs_repair_eb_io_failure(eb, failed_mirror); return ret; } @@ -533,7 +510,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) { u64 start = page_offset(page); u64 found_start; + u8 result[BTRFS_CSUM_SIZE]; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); struct extent_buffer *eb; + int ret; eb = (struct extent_buffer *)page->private; if (page != eb->pages[0]) @@ -552,12 +532,28 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0); - return csum_tree_block(fs_info, eb, 0); + if (csum_tree_block(eb, result)) + return -EINVAL; + + if (btrfs_header_level(eb)) + ret = btrfs_check_node(eb); + else + ret = btrfs_check_leaf_full(eb); + + if (ret < 0) { + btrfs_err(fs_info, + "block=%llu write time tree block corruption detected", + eb->start); + return ret; + } + write_extent_buffer(eb, result, 0, csum_size); + + return 0; } -static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +static int check_tree_block_fsid(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u8 fsid[BTRFS_FSID_SIZE]; int ret = 1; @@ -595,7 +591,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; struct btrfs_fs_info *fs_info = root->fs_info; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int ret = 0; + u8 result[BTRFS_CSUM_SIZE]; int reads_done; if (!page->private) @@ -625,7 +623,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, ret = -EIO; goto err; } - if (check_tree_block_fsid(fs_info, eb)) { + if (check_tree_block_fsid(eb)) { btrfs_err_rl(fs_info, "bad fsid on block %llu", eb->start); ret = -EIO; @@ -642,25 +640,44 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); - ret = csum_tree_block(fs_info, eb, 1); + ret = csum_tree_block(eb, result); if (ret) goto err; + if (memcmp_extent_buffer(eb, result, 0, csum_size)) { + u32 val; + u32 found = 0; + + memcpy(&found, result, csum_size); + + read_extent_buffer(eb, &val, 0, csum_size); + btrfs_warn_rl(fs_info, + "%s checksum verify failed on %llu wanted %x found %x level %d", + fs_info->sb->s_id, eb->start, + val, found, btrfs_header_level(eb)); + ret = -EUCLEAN; + goto err; + } + /* * If this is a leaf block and it is corrupt, set the corrupt bit so * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) { + if (found_level == 0 && btrfs_check_leaf_full(eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } - if (found_level > 0 && btrfs_check_node(fs_info, eb)) + if (found_level > 0 && btrfs_check_node(eb)) ret = -EIO; if (!ret) set_extent_buffer_uptodate(eb); + else + btrfs_err(fs_info, + "block=%llu read time tree block corruption detected", + eb->start); err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) @@ -832,11 +849,11 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) { struct bio_vec *bvec; struct btrfs_root *root; - int i, ret = 0; + int ret = 0; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); if (ret) @@ -867,11 +884,10 @@ static int check_async_write(struct btrfs_inode *bi) return 1; } -static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, + int mirror_num, + unsigned long bio_flags) { - struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(BTRFS_I(inode)); blk_status_t ret; @@ -897,8 +913,7 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, * checksumming can happen in parallel across all CPUs */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, - bio_offset, private_data, - btree_submit_bio_start); + 0, inode, btree_submit_bio_start); } if (ret) @@ -1017,22 +1032,23 @@ static const struct address_space_operations btree_aops = { void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) { struct extent_buffer *buf = NULL; - struct inode *btree_inode = fs_info->btree_inode; + int ret; buf = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(buf)) return; - read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, WAIT_NONE, 0); - free_extent_buffer(buf); + + ret = read_extent_buffer_pages(buf, WAIT_NONE, 0); + if (ret < 0) + free_extent_buffer_stale(buf); + else + free_extent_buffer(buf); } int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, int mirror_num, struct extent_buffer **eb) { struct extent_buffer *buf = NULL; - struct inode *btree_inode = fs_info->btree_inode; - struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; int ret; buf = btrfs_find_create_tree_block(fs_info, bytenr); @@ -1041,15 +1057,14 @@ int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); - ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK, - mirror_num); + ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num); if (ret) { - free_extent_buffer(buf); + free_extent_buffer_stale(buf); return ret; } if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { - free_extent_buffer(buf); + free_extent_buffer_stale(buf); return -EIO; } else if (extent_buffer_uptodate(buf)) { *eb = buf; @@ -1068,19 +1083,6 @@ struct extent_buffer *btrfs_find_create_tree_block( return alloc_extent_buffer(fs_info, bytenr); } - -int btrfs_write_tree_block(struct extent_buffer *buf) -{ - return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, - buf->start + buf->len - 1); -} - -void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) -{ - filemap_fdatawait_range(buf->pages[0]->mapping, - buf->start, buf->start + buf->len - 1); -} - /* * Read tree block at logical address @bytenr and do variant basic but critical * verification. @@ -1100,19 +1102,19 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, if (IS_ERR(buf)) return buf; - ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid, + ret = btree_read_extent_buffer_pages(buf, parent_transid, level, first_key); if (ret) { - free_extent_buffer(buf); + free_extent_buffer_stale(buf); return ERR_PTR(ret); } return buf; } -void clean_tree_block(struct btrfs_fs_info *fs_info, - struct extent_buffer *buf) +void btrfs_clean_tree_block(struct extent_buffer *buf) { + struct btrfs_fs_info *fs_info = buf->fs_info; if (btrfs_header_generation(buf) == fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); @@ -1208,7 +1210,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->log_transid_committed = -1; root->last_log_commit = 0; if (!dummy) - extent_io_tree_init(&root->dirty_log_pages, NULL); + extent_io_tree_init(fs_info, &root->dirty_log_pages, + IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); @@ -1255,9 +1258,9 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info) #endif struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 objectid) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_buffer *leaf; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; @@ -2138,8 +2141,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) inode->i_mapping->a_ops = &btree_aops; RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode); - BTRFS_I(inode)->io_tree.track_uptodate = 0; + extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, + IO_TREE_INODE_IO, inode); + BTRFS_I(inode)->io_tree.track_uptodate = false; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; @@ -2162,7 +2166,6 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->qgroup_lock); mutex_init(&fs_info->qgroup_ioctl_lock); fs_info->qgroup_tree = RB_ROOT; - fs_info->qgroup_op_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; @@ -2630,11 +2633,17 @@ int open_ctree(struct super_block *sb, goto fail; } - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); + ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_srcu; } + + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); + if (ret) { + err = ret; + goto fail_dio_bytes; + } fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -2667,7 +2676,6 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); - spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); rwlock_init(&fs_info->tree_mod_log_lock); @@ -2694,7 +2702,6 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->defrag_running, 0); - atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); atomic_set(&fs_info->nr_delayed_iputs, 0); atomic64_set(&fs_info->tree_mod_seq, 0); @@ -2748,8 +2755,10 @@ int open_ctree(struct super_block *sb, fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; - extent_io_tree_init(&fs_info->freed_extents[0], NULL); - extent_io_tree_init(&fs_info->freed_extents[1], NULL); + extent_io_tree_init(fs_info, &fs_info->freed_extents[0], + IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); + extent_io_tree_init(fs_info, &fs_info->freed_extents[1], + IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_BARRIER, &fs_info->flags); @@ -2776,8 +2785,6 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->async_submit_wait); init_waitqueue_head(&fs_info->delayed_iputs_wait); - INIT_LIST_HEAD(&fs_info->pinned_chunks); - /* Usable values until the real ones are cached from the superblock */ fs_info->nodesize = 4096; fs_info->sectorsize = 4096; @@ -3335,6 +3342,8 @@ fail_delalloc_bytes: percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: percpu_counter_destroy(&fs_info->dirty_metadata_bytes); +fail_dio_bytes: + percpu_counter_destroy(&fs_info->dio_bytes); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: @@ -4016,6 +4025,10 @@ void close_ctree(struct btrfs_fs_info *fs_info) percpu_counter_sum(&fs_info->delalloc_bytes)); } + if (percpu_counter_sum(&fs_info->dio_bytes)) + btrfs_info(fs_info, "at unmount dio bytes count %lld", + percpu_counter_sum(&fs_info->dio_bytes)); + btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); @@ -4042,25 +4055,17 @@ void close_ctree(struct btrfs_fs_info *fs_info) btrfsic_unmount(fs_info->fs_devices); #endif - btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_close_devices(fs_info->fs_devices); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); + percpu_counter_destroy(&fs_info->dio_bytes); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); cleanup_srcu_struct(&fs_info->subvol_srcu); btrfs_free_stripe_hash_table(fs_info); btrfs_free_ref_cache(fs_info); - - while (!list_empty(&fs_info->pinned_chunks)) { - struct extent_map *em; - - em = list_first_entry(&fs_info->pinned_chunks, - struct extent_map, list); - list_del_init(&em->list); - free_extent_map(em); - } } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, @@ -4114,7 +4119,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) * So here we should only check item pointers, not item data. */ if (btrfs_header_level(buf) == 0 && - btrfs_check_leaf_relaxed(fs_info, buf)) { + btrfs_check_leaf_relaxed(buf)) { btrfs_print_leaf(buf); ASSERT(0); } @@ -4157,10 +4162,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key) { - struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; - struct btrfs_fs_info *fs_info = root->fs_info; - - return btree_read_extent_buffer_pages(fs_info, buf, parent_transid, + return btree_read_extent_buffer_pages(buf, parent_transid, level, first_key); } @@ -4484,10 +4486,17 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_fs_info *fs_info) { + struct btrfs_device *dev, *tmp; + btrfs_cleanup_dirty_bgs(cur_trans, fs_info); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); + list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list, + post_commit_list) { + list_del_init(&dev->post_commit_list); + } + btrfs_destroy_delayed_refs(cur_trans, fs_info); cur_trans->state = TRANS_STATE_COMMIT_START; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 987a64bc0c66..a0161aa1ea0b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,6 +39,8 @@ static inline u64 btrfs_sb_offset(int mirror) struct btrfs_device; struct btrfs_fs_devices; +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 parent_transid, int level, struct btrfs_key *first_key); @@ -48,7 +50,7 @@ int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr); -void clean_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *buf); +void btrfs_clean_tree_block(struct extent_buffer *buf); int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); @@ -123,8 +125,6 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); -int btrfs_write_tree_block(struct extent_buffer *buf); -void btrfs_wait_tree_block_writeback(struct extent_buffer *buf); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, @@ -134,7 +134,6 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans, void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c5880329ae37..f79e477a378e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -643,7 +643,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, if (btrfs_test_opt(fs_info, SPACE_CACHE)) { mutex_lock(&caching_ctl->mutex); - ret = load_free_space_cache(fs_info, cache); + ret = load_free_space_cache(cache); spin_lock(&cache->lock); if (ret == 1) { @@ -756,14 +756,15 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, return NULL; } -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes, - bool metadata, u64 root_objectid) +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_ref *ref) { struct btrfs_space_info *space_info; + s64 num_bytes = -ref->len; u64 flags; - if (metadata) { - if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + if (ref->type == BTRFS_REF_METADATA) { + if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) flags = BTRFS_BLOCK_GROUP_SYSTEM; else flags = BTRFS_BLOCK_GROUP_METADATA; @@ -1704,7 +1705,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - btrfs_extend_item(fs_info, path, size); + btrfs_extend_item(path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1779,7 +1780,6 @@ void update_inline_extent_backref(struct btrfs_path *path, int *last_ref) { struct extent_buffer *leaf = path->nodes[0]; - struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; struct btrfs_extent_data_ref *dref = NULL; struct btrfs_shared_data_ref *sref = NULL; @@ -1834,7 +1834,7 @@ void update_inline_extent_backref(struct btrfs_path *path, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - btrfs_truncate_item(fs_info, path, item_size, 1); + btrfs_truncate_item(path, item_size, 1); } btrfs_mark_buffer_dirty(leaf); } @@ -1905,7 +1905,6 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 *discarded_bytes) { @@ -2043,39 +2042,28 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, /* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) + struct btrfs_ref *generic_ref) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod, new_ref_mod; int ret; - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && - root_objectid == BTRFS_TREE_LOG_OBJECTID); + ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && + generic_ref->action); + BUG_ON(generic_ref->type == BTRFS_REF_METADATA && + generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID); - btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid, - owner, offset, BTRFS_ADD_DELAYED_REF); - - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, - num_bytes, parent, - root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, - &old_ref_mod, &new_ref_mod); - } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, - num_bytes, parent, - root_objectid, owner, offset, - 0, BTRFS_ADD_DELAYED_REF, + if (generic_ref->type == BTRFS_REF_METADATA) + ret = btrfs_add_delayed_tree_ref(trans, generic_ref, + NULL, &old_ref_mod, &new_ref_mod); + else + ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0, &old_ref_mod, &new_ref_mod); - } - if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) { - bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + btrfs_ref_tree_mod(fs_info, generic_ref); - add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid); - } + if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) + add_pinned_bytes(fs_info, generic_ref); return ret; } @@ -2877,97 +2865,6 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) return btrfs_check_space_for_delayed_refs(trans->fs_info); } -struct async_delayed_refs { - struct btrfs_root *root; - u64 transid; - int count; - int error; - int sync; - struct completion wait; - struct btrfs_work work; -}; - -static inline struct async_delayed_refs * -to_async_delayed_refs(struct btrfs_work *work) -{ - return container_of(work, struct async_delayed_refs, work); -} - -static void delayed_ref_async_start(struct btrfs_work *work) -{ - struct async_delayed_refs *async = to_async_delayed_refs(work); - struct btrfs_trans_handle *trans; - struct btrfs_fs_info *fs_info = async->root->fs_info; - int ret; - - /* if the commit is already started, we don't need to wait here */ - if (btrfs_transaction_blocked(fs_info)) - goto done; - - trans = btrfs_join_transaction(async->root); - if (IS_ERR(trans)) { - async->error = PTR_ERR(trans); - goto done; - } - - /* - * trans->sync means that when we call end_transaction, we won't - * wait on delayed refs - */ - trans->sync = true; - - /* Don't bother flushing if we got into a different transaction */ - if (trans->transid > async->transid) - goto end; - - ret = btrfs_run_delayed_refs(trans, async->count); - if (ret) - async->error = ret; -end: - ret = btrfs_end_transaction(trans); - if (ret && !async->error) - async->error = ret; -done: - if (async->sync) - complete(&async->wait); - else - kfree(async); -} - -int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, - unsigned long count, u64 transid, int wait) -{ - struct async_delayed_refs *async; - int ret; - - async = kmalloc(sizeof(*async), GFP_NOFS); - if (!async) - return -ENOMEM; - - async->root = fs_info->tree_root; - async->count = count; - async->error = 0; - async->transid = transid; - if (wait) - async->sync = 1; - else - async->sync = 0; - init_completion(&async->wait); - - btrfs_init_work(&async->work, btrfs_extent_refs_helper, - delayed_ref_async_start, NULL, NULL); - - btrfs_queue_work(fs_info->extent_workers, &async->work); - - if (wait) { - wait_for_completion(&async->wait); - ret = async->error; - kfree(async); - return ret; - } - return 0; -} - /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -3036,7 +2933,6 @@ out: } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 flags, int level, int is_data) { @@ -3053,8 +2949,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->is_data = is_data ? true : false; extent_op->level = level; - ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr, - num_bytes, extent_op); + ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); if (ret) btrfs_free_delayed_extent_op(extent_op); return ret; @@ -3246,13 +3141,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, u32 nritems; struct btrfs_key key; struct btrfs_file_extent_item *fi; + struct btrfs_ref generic_ref = { 0 }; + bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); int i; + int action; int level; int ret = 0; - int (*process_func)(struct btrfs_trans_handle *, - struct btrfs_root *, - u64, u64, u64, u64, u64, u64); - if (btrfs_is_testing(fs_info)) return 0; @@ -3264,15 +3158,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) return 0; - if (inc) - process_func = btrfs_inc_extent_ref; - else - process_func = btrfs_free_extent; - if (full_backref) parent = buf->start; else parent = 0; + if (inc) + action = BTRFS_ADD_DELAYED_REF; + else + action = BTRFS_DROP_DELAYED_REF; for (i = 0; i < nritems; i++) { if (level == 0) { @@ -3290,16 +3183,30 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); key.offset -= btrfs_file_extent_offset(buf, fi); - ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, key.objectid, - key.offset); + btrfs_init_generic_ref(&generic_ref, action, bytenr, + num_bytes, parent); + generic_ref.real_root = root->root_key.objectid; + btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, + key.offset); + generic_ref.skip_qgroup = for_reloc; + if (inc) + ret = btrfs_inc_extent_ref(trans, &generic_ref); + else + ret = btrfs_free_extent(trans, &generic_ref); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = fs_info->nodesize; - ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0); + btrfs_init_generic_ref(&generic_ref, action, bytenr, + num_bytes, parent); + generic_ref.real_root = root->root_key.objectid; + btrfs_init_tree_ref(&generic_ref, level - 1, ref_root); + generic_ref.skip_qgroup = for_reloc; + if (inc) + ret = btrfs_inc_extent_ref(trans, &generic_ref); + else + ret = btrfs_free_extent(trans, &generic_ref); if (ret) goto fail; } @@ -3322,10 +3229,10 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, } static int write_one_cache_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_block_group_cache *cache) { + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_root *extent_root = fs_info->extent_root; unsigned long bi; @@ -3348,10 +3255,10 @@ fail: } -static struct btrfs_block_group_cache * -next_block_group(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache) +static struct btrfs_block_group_cache *next_block_group( + struct btrfs_block_group_cache *cache) { + struct btrfs_fs_info *fs_info = cache->fs_info; struct rb_node *node; spin_lock(&fs_info->block_group_cache_lock); @@ -3404,7 +3311,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, if (trans->aborted) return 0; again: - inode = lookup_free_space_inode(fs_info, block_group, path); + inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { ret = PTR_ERR(inode); btrfs_release_path(path); @@ -3418,8 +3325,7 @@ again: if (block_group->ro) goto out_free; - ret = create_free_space_inode(fs_info, trans, block_group, - path); + ret = create_free_space_inode(trans, block_group, path); if (ret) goto out_free; goto again; @@ -3538,9 +3444,9 @@ out: return ret; } -int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *cache, *tmp; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_path *path; @@ -3652,8 +3558,7 @@ again: if (cache->disk_cache_state == BTRFS_DC_SETUP) { cache->io_ctl.inode = NULL; - ret = btrfs_write_out_cache(fs_info, trans, - cache, path); + ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { num_started++; should_put = 0; @@ -3673,8 +3578,7 @@ again: } } if (!ret) { - ret = write_one_cache_group(trans, fs_info, - path, cache); + ret = write_one_cache_group(trans, path, cache); /* * Our block group might still be attached to the list * of new block groups in the transaction handle of some @@ -3744,9 +3648,9 @@ again: return ret; } -int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *cache; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; @@ -3809,8 +3713,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { cache->io_ctl.inode = NULL; - ret = btrfs_write_out_cache(fs_info, trans, - cache, path); + ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { num_started++; should_put = 0; @@ -3824,8 +3727,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, } } if (!ret) { - ret = write_one_cache_group(trans, fs_info, - path, cache); + ret = write_one_cache_group(trans, path, cache); /* * One of the free space endio workers might have * created a new block group while updating a free space @@ -3842,8 +3744,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, if (ret == -ENOENT) { wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); - ret = write_one_cache_group(trans, fs_info, - path, cache); + ret = write_one_cache_group(trans, path, cache); } if (ret) btrfs_abort_transaction(trans, ret); @@ -4732,6 +4633,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; + u64 dio_bytes; u64 async_pages; u64 items; long time_left; @@ -4747,7 +4649,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); - if (delalloc_bytes == 0) { + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + if (delalloc_bytes == 0 && dio_bytes == 0) { if (trans) return; if (wait_ordered) @@ -4755,8 +4658,16 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, return; } + /* + * If we are doing more ordered than delalloc we need to just wait on + * ordered extents, otherwise we'll waste time trying to flush delalloc + * that likely won't give us the space back we need. + */ + if (dio_bytes > delalloc_bytes) + wait_ordered = true; + loops = 0; - while (delalloc_bytes && loops < 3) { + while ((delalloc_bytes || dio_bytes) && loops < 3) { nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; /* @@ -4806,6 +4717,7 @@ skip_async: } delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); } } @@ -5803,85 +5715,6 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv, - u64 *metadata_bytes, u64 *qgroup_bytes) -{ - *metadata_bytes = 0; - *qgroup_bytes = 0; - - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) - *metadata_bytes = block_rsv->size - block_rsv->reserved; - if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size) - *qgroup_bytes = block_rsv->qgroup_rsv_size - - block_rsv->qgroup_rsv_reserved; - spin_unlock(&block_rsv->lock); -} - -/** - * btrfs_inode_rsv_refill - refill the inode block rsv. - * @inode - the inode we are refilling. - * @flush - the flushing restriction. - * - * Essentially the same as btrfs_block_rsv_refill, except it uses the - * block_rsv->size as the minimum size. We'll either refill the missing amount - * or return if we already have enough space. This will also handle the reserve - * tracepoint for the reserved amount. - */ -static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, - enum btrfs_reserve_flush_enum flush) -{ - struct btrfs_root *root = inode->root; - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 num_bytes, last = 0; - u64 qgroup_num_bytes; - int ret = -ENOSPC; - - calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes); - if (num_bytes == 0) - return 0; - - do { - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, - true); - if (ret) - return ret; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (ret) { - btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); - last = num_bytes; - /* - * If we are fragmented we can end up with a lot of - * outstanding extents which will make our size be much - * larger than our reserved amount. - * - * If the reservation happens here, it might be very - * big though not needed in the end, if the delalloc - * flushing happens. - * - * If this is the case try and do the reserve again. - */ - if (flush == BTRFS_RESERVE_FLUSH_ALL) - calc_refill_bytes(block_rsv, &num_bytes, - &qgroup_num_bytes); - if (num_bytes == 0) - return 0; - } - } while (ret && last != num_bytes); - - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, false); - trace_btrfs_space_reservation(root->fs_info, "delalloc", - btrfs_ino(inode), num_bytes, 1); - - /* Don't forget to increase qgroup_rsv_reserved */ - spin_lock(&block_rsv->lock); - block_rsv->qgroup_rsv_reserved += qgroup_num_bytes; - spin_unlock(&block_rsv->lock); - } - return ret; -} - static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, u64 *qgroup_to_release) @@ -6182,9 +6015,25 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, spin_unlock(&block_rsv->lock); } +static void calc_inode_reservations(struct btrfs_fs_info *fs_info, + u64 num_bytes, u64 *meta_reserve, + u64 *qgroup_reserve) +{ + u64 nr_extents = count_max_extents(num_bytes); + u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); + + /* We add one for the inode update at finish ordered time */ + *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, + nr_extents + csum_leaves + 1); + *qgroup_reserve = nr_extents * fs_info->nodesize; +} + int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 meta_reserve, qgroup_reserve; unsigned nr_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; @@ -6214,7 +6063,31 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - /* Add our new extents and calculate the new rsv size. */ + /* + * We always want to do it this way, every other way is wrong and ends + * in tears. Pre-reserving the amount we are going to add will always + * be the right way, because otherwise if we have enough parallelism we + * could end up with thousands of inodes all holding little bits of + * reservations they were able to make previously and the only way to + * reclaim that space is to ENOSPC out the operations and clear + * everything out and try again, which is bad. This way we just + * over-reserve slightly, and clean up the mess when we are done. + */ + calc_inode_reservations(fs_info, num_bytes, &meta_reserve, + &qgroup_reserve); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); + if (ret) + goto out_fail; + ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); + if (ret) + goto out_qgroup; + + /* + * Now we need to update our outstanding extents and csum bytes _first_ + * and then add the reservation to the block_rsv. This keeps us from + * racing with an ordered completion or some such that would think it + * needs to free the reservation we just made. + */ spin_lock(&inode->lock); nr_extents = count_max_extents(num_bytes); btrfs_mod_outstanding_extents(inode, nr_extents); @@ -6222,22 +6095,21 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - ret = btrfs_inode_rsv_refill(inode, flush); - if (unlikely(ret)) - goto out_fail; + /* Now we can safely add our space to our block rsv */ + block_rsv_add_bytes(block_rsv, meta_reserve, false); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), meta_reserve, 1); + + spin_lock(&block_rsv->lock); + block_rsv->qgroup_rsv_reserved += qgroup_reserve; + spin_unlock(&block_rsv->lock); if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); return 0; - +out_qgroup: + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); out_fail: - spin_lock(&inode->lock); - nr_extents = count_max_extents(num_bytes); - btrfs_mod_outstanding_extents(inode, -nr_extents); - inode->csum_bytes -= num_bytes; - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - btrfs_inode_rsv_release(inode, true); if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); @@ -6361,9 +6233,9 @@ void btrfs_delalloc_release_space(struct inode *inode, } static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *info, u64 bytenr, - u64 num_bytes, int alloc) + u64 bytenr, u64 num_bytes, int alloc) { + struct btrfs_fs_info *info = trans->fs_info; struct btrfs_block_group_cache *cache = NULL; u64 total = num_bytes; u64 old_val; @@ -6444,7 +6316,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, if (list_empty(&cache->dirty_list)) { list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs); - trans->transaction->num_dirty_bgs++; trans->delayed_ref_updates++; btrfs_get_block_group(cache); } @@ -6491,10 +6362,11 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) return bytenr; } -static int pin_down_extent(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, +static int pin_down_extent(struct btrfs_block_group_cache *cache, u64 bytenr, u64 num_bytes, int reserved) { + struct btrfs_fs_info *fs_info = cache->fs_info; + spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; @@ -6526,7 +6398,7 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info, cache = btrfs_lookup_block_group(fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved); + pin_down_extent(cache, bytenr, num_bytes, reserved); btrfs_put_block_group(cache); return 0; @@ -6553,7 +6425,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, */ cache_block_group(cache, 1); - pin_down_extent(fs_info, cache, bytenr, num_bytes, 0); + pin_down_extent(cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, bytenr, num_bytes); @@ -6607,9 +6479,9 @@ out_lock: return ret; } -int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +int btrfs_exclude_logged_extents(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_file_extent_item *item; struct btrfs_key key; int found_type; @@ -7198,7 +7070,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } - ret = update_block_group(trans, info, bytenr, num_bytes, 0); + ret = update_block_group(trans, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -7272,21 +7144,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 parent, int last_ref) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ref generic_ref = { 0 }; int pin = 1; int ret; + btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, + buf->start, buf->len, parent); + btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), + root->root_key.objectid); + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { int old_ref_mod, new_ref_mod; - btrfs_ref_tree_mod(root, buf->start, buf->len, parent, - root->root_key.objectid, - btrfs_header_level(buf), 0, - BTRFS_DROP_DELAYED_REF); - ret = btrfs_add_delayed_tree_ref(trans, buf->start, - buf->len, parent, - root->root_key.objectid, - btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, + btrfs_ref_tree_mod(fs_info, &generic_ref); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL, &old_ref_mod, &new_ref_mod); BUG_ON(ret); /* -ENOMEM */ pin = old_ref_mod >= 0 && new_ref_mod < 0; @@ -7305,8 +7176,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(fs_info, cache, buf->start, - buf->len, 1); + pin_down_extent(cache, buf->start, buf->len, 1); btrfs_put_block_group(cache); goto out; } @@ -7320,8 +7190,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } out: if (pin) - add_pinned_bytes(fs_info, buf->len, true, - root->root_key.objectid); + add_pinned_bytes(fs_info, &generic_ref); if (last_ref) { /* @@ -7333,52 +7202,43 @@ out: } /* Can return -ENOMEM */ -int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset) +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) return 0; - if (root_objectid != BTRFS_TREE_LOG_OBJECTID) - btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, - root_objectid, owner, offset, - BTRFS_DROP_DELAYED_REF); - /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. */ - if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { - WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); + if ((ref->type == BTRFS_REF_METADATA && + ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + (ref->type == BTRFS_REF_DATA && + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ - btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); + btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1); old_ref_mod = new_ref_mod = 0; ret = 0; - } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(trans, bytenr, - num_bytes, parent, - root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, + } else if (ref->type == BTRFS_REF_METADATA) { + ret = btrfs_add_delayed_tree_ref(trans, ref, NULL, &old_ref_mod, &new_ref_mod); } else { - ret = btrfs_add_delayed_data_ref(trans, bytenr, - num_bytes, parent, - root_objectid, owner, offset, - 0, BTRFS_DROP_DELAYED_REF, + ret = btrfs_add_delayed_data_ref(trans, ref, 0, &old_ref_mod, &new_ref_mod); } - if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) { - bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + if (!((ref->type == BTRFS_REF_METADATA && + ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + (ref->type == BTRFS_REF_DATA && + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) + btrfs_ref_tree_mod(fs_info, ref); - add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid); - } + if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) + add_pinned_bytes(fs_info, ref); return ret; } @@ -7569,7 +7429,6 @@ static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group_cache **cluster_bg_ret) { - struct btrfs_fs_info *fs_info = bg->fs_info; struct btrfs_block_group_cache *cluster_bg; u64 aligned_cluster; u64 offset; @@ -7629,9 +7488,8 @@ refill_cluster: aligned_cluster = max_t(u64, ffe_ctl->empty_cluster + ffe_ctl->empty_size, bg->full_stripe_len); - ret = btrfs_find_space_cluster(fs_info, bg, last_ptr, - ffe_ctl->search_start, ffe_ctl->num_bytes, - aligned_cluster); + ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start, + ffe_ctl->num_bytes, aligned_cluster); if (ret == 0) { /* Now pull our allocation out of this cluster */ offset = btrfs_alloc_from_cluster(bg, last_ptr, @@ -8281,7 +8139,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, } if (pin) - pin_down_extent(fs_info, cache, start, len, 1); + pin_down_extent(cache, start, len, 1); else { if (btrfs_test_opt(fs_info, DISCARD)) ret = btrfs_discard_extent(fs_info, start, len, NULL); @@ -8370,7 +8228,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1); + ret = update_block_group(trans, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -8460,7 +8318,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(trans, fs_info, extent_key.objectid, + ret = update_block_group(trans, extent_key.objectid, fs_info->nodesize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -8478,19 +8336,17 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { + struct btrfs_ref generic_ref = { 0 }; int ret; BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); - btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0, - root->root_key.objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT); - - ret = btrfs_add_delayed_data_ref(trans, ins->objectid, - ins->offset, 0, - root->root_key.objectid, owner, - offset, ram_bytes, - BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); + btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, + ins->objectid, ins->offset, 0); + btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); + btrfs_ref_tree_mod(root->fs_info, &generic_ref); + ret = btrfs_add_delayed_data_ref(trans, &generic_ref, + ram_bytes, NULL, NULL); return ret; } @@ -8563,7 +8419,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); - clean_tree_block(fs_info, buf); + btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking_write(buf); @@ -8682,6 +8538,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; struct btrfs_delayed_extent_op *extent_op; + struct btrfs_ref generic_ref = { 0 }; u64 flags = 0; int ret; u32 blocksize = fs_info->nodesize; @@ -8736,13 +8593,12 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->is_data = false; extent_op->level = level; - btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent, - root_objectid, level, 0, - BTRFS_ADD_DELAYED_EXTENT); - ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, - ins.offset, parent, - root_objectid, level, - BTRFS_ADD_DELAYED_EXTENT, + btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, + ins.objectid, ins.offset, parent); + generic_ref.real_root = root->root_key.objectid; + btrfs_init_tree_ref(&generic_ref, level, root_objectid); + btrfs_ref_tree_mod(fs_info, &generic_ref); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op, NULL, NULL); if (ret) goto out_free_delayed; @@ -8918,7 +8774,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start, + ret = btrfs_set_disk_extent_flags(trans, eb->start, eb->len, flag, btrfs_header_level(eb), 0); BUG_ON(ret); /* -ENOMEM */ @@ -8987,6 +8843,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 parent; struct btrfs_key key; struct btrfs_key first_key; + struct btrfs_ref ref = { 0 }; struct extent_buffer *next; int level = wc->level; int reada = 0; @@ -9159,9 +9016,10 @@ skip: wc->drop_level = level; find_next_key(path, level, &wc->drop_progress); - ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize, - parent, root->root_key.objectid, - level - 1, 0); + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, + fs_info->nodesize, parent); + btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid); + ret = btrfs_free_extent(trans, &ref); if (ret) goto out_unlock; } @@ -9251,21 +9109,23 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, else ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_qgroup_trace_leaf_items(trans, eb); - if (ret) { - btrfs_err_rl(fs_info, - "error %d accounting leaf items. Quota is out of sync, rescan required.", + if (is_fstree(root->root_key.objectid)) { + ret = btrfs_qgroup_trace_leaf_items(trans, eb); + if (ret) { + btrfs_err_rl(fs_info, + "error %d accounting leaf items, quota is out of sync, rescan required", ret); + } } } - /* make block locked assertion in clean_tree_block happy */ + /* make block locked assertion in btrfs_clean_tree_block happy */ if (!path->locks[level] && btrfs_header_generation(eb) == trans->transid) { btrfs_tree_lock(eb); btrfs_set_lock_blocking_write(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } - clean_tree_block(fs_info, eb); + btrfs_clean_tree_block(eb); } if (eb == root->node) { @@ -9921,12 +9781,10 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) */ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) { - struct btrfs_root *root = fs_info->extent_root; struct btrfs_block_group_cache *block_group; struct btrfs_space_info *space_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - struct btrfs_trans_handle *trans; u64 min_free; u64 dev_min = 1; u64 dev_nr = 0; @@ -10025,13 +9883,6 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) min_free = div64_u64(min_free, dev_min); } - /* We need to do this so that we can look at pending chunks */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - mutex_lock(&fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 dev_offset; @@ -10042,7 +9893,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) */ if (device->total_bytes > device->bytes_used + min_free && !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = find_free_dev_extent(trans, device, min_free, + ret = find_free_dev_extent(device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; @@ -10058,7 +9909,6 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) "no space to allocate a new chunk for block group %llu", block_group->key.objectid); mutex_unlock(&fs_info->chunk_mutex); - btrfs_end_transaction(trans); out: btrfs_put_block_group(block_group); return ret; @@ -10159,7 +10009,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) if (block_group->iref) break; spin_unlock(&block_group->lock); - block_group = next_block_group(info, block_group); + block_group = next_block_group(block_group); } if (!block_group) { if (last == 0) @@ -10660,7 +10510,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, struct btrfs_block_group_cache *cache; int ret; - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); if (!cache) @@ -10808,7 +10658,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * get the inode first so any iput calls done for the io_list * aren't the final iput (no unlinks allowed now) */ - inode = lookup_free_space_inode(fs_info, block_group, path); + inode = lookup_free_space_inode(block_group, path); mutex_lock(&trans->transaction->cache_write_mutex); /* @@ -10952,10 +10802,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); mutex_lock(&fs_info->chunk_mutex); - if (!list_empty(&em->list)) { - /* We're in the transaction->pending_chunks list. */ - free_extent_map(em); - } spin_lock(&block_group->lock); block_group->removed = 1; /* @@ -10982,25 +10828,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * the transaction commit has completed. */ remove_em = (atomic_read(&block_group->trimming) == 0); - /* - * Make sure a trimmer task always sees the em in the pinned_chunks list - * if it sees block_group->removed == 1 (needs to lock block_group->lock - * before checking block_group->removed). - */ - if (!remove_em) { - /* - * Our em might be in trans->transaction->pending_chunks which - * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), - * and so is the fs_info->pinned_chunks list. - * - * So at this point we must be holding the chunk_mutex to avoid - * any races with chunk allocation (more specifically at - * volumes.c:contains_pending_extent()), to ensure it always - * sees the em, either in the pending_chunks list or in the - * pinned_chunks list. - */ - list_move_tail(&em->list, &fs_info->pinned_chunks); - } spin_unlock(&block_group->lock); if (remove_em) { @@ -11008,11 +10835,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, em_tree = &fs_info->mapping_tree.map_tree; write_lock(&em_tree->lock); - /* - * The em might be in the pending_chunks list, so make sure the - * chunk mutex is locked, since remove_extent_mapping() will - * delete us from that list. - */ remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); /* once for the tree */ @@ -11315,11 +11137,12 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, * held back allocations. */ static int btrfs_trim_free_extents(struct btrfs_device *device, - u64 minlen, u64 *trimmed) + struct fstrim_range *range, u64 *trimmed) { - u64 start = 0, len = 0; + u64 start, len = 0, end = 0; int ret; + start = max_t(u64, range->start, SZ_1M); *trimmed = 0; /* Discard not supported = nothing to do. */ @@ -11338,43 +11161,52 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, while (1) { struct btrfs_fs_info *fs_info = device->fs_info; - struct btrfs_transaction *trans; u64 bytes; ret = mutex_lock_interruptible(&fs_info->chunk_mutex); if (ret) break; - ret = down_read_killable(&fs_info->commit_root_sem); - if (ret) { + find_first_clear_extent_bit(&device->alloc_state, start, + &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + /* + * If find_first_clear_extent_bit find a range that spans the + * end of the device it will set end to -1, in this case it's up + * to the caller to trim the value to the size of the device. + */ + end = min(end, device->total_bytes - 1); + len = end - start + 1; + + /* We didn't find any extents */ + if (!len) { mutex_unlock(&fs_info->chunk_mutex); + ret = 0; break; } - spin_lock(&fs_info->trans_lock); - trans = fs_info->running_transaction; - if (trans) - refcount_inc(&trans->use_count); - spin_unlock(&fs_info->trans_lock); - - if (!trans) - up_read(&fs_info->commit_root_sem); - - ret = find_free_dev_extent_start(trans, device, minlen, start, - &start, &len); - if (trans) { - up_read(&fs_info->commit_root_sem); - btrfs_put_transaction(trans); + /* Keep going until we satisfy minlen or reach end of space */ + if (len < range->minlen) { + mutex_unlock(&fs_info->chunk_mutex); + start += len; + continue; } - if (ret) { + /* If we are out of the passed range break */ + if (start > range->start + range->len - 1) { mutex_unlock(&fs_info->chunk_mutex); - if (ret == -ENOSPC) - ret = 0; break; } - ret = btrfs_issue_discard(device->bdev, start, len, &bytes); + start = max(range->start, start); + len = min(range->len, len); + + ret = btrfs_issue_discard(device->bdev, start, len, + &bytes); + if (!ret) + set_extent_bits(&device->alloc_state, start, + start + bytes - 1, + CHUNK_TRIMMED); mutex_unlock(&fs_info->chunk_mutex); if (ret) @@ -11383,6 +11215,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, start += len; *trimmed += bytes; + /* We've trimmed enough */ + if (*trimmed >= range->len) + break; + if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -11419,7 +11255,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) int ret = 0; cache = btrfs_lookup_first_block_group(fs_info, range->start); - for (; cache; cache = next_block_group(fs_info, cache)) { + for (; cache; cache = next_block_group(cache)) { if (cache->key.objectid >= (range->start + range->len)) { btrfs_put_block_group(cache); break; @@ -11466,8 +11302,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) mutex_lock(&fs_info->fs_devices->device_list_mutex); devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { - ret = btrfs_trim_free_extents(device, range->minlen, - &group_trimmed); + ret = btrfs_trim_free_extents(device, range, &group_trimmed); if (ret) { dev_failed++; dev_ret = ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ca8b8e785cf3..db337e53aab3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -109,8 +109,6 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif -#define BUFFER_LRU_MAX 64 - struct tree_entry { u64 start; u64 end; @@ -151,34 +149,51 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) { blk_status_t ret = 0; - struct bio_vec *bvec = bio_last_bvec_all(bio); - struct bio_vec bv; struct extent_io_tree *tree = bio->bi_private; - u64 start; - - mp_bvec_last_segment(bvec, &bv); - start = page_offset(bv.bv_page) + bv.bv_offset; bio->bi_private = NULL; if (tree->ops) ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags, start); + mirror_num, bio_flags); else btrfsic_submit_bio(bio); return blk_status_to_errno(ret); } -static void flush_write_bio(struct extent_page_data *epd) +/* Cleanup unsubmitted bios */ +static void end_write_bio(struct extent_page_data *epd, int ret) { if (epd->bio) { - int ret; + epd->bio->bi_status = errno_to_blk_status(ret); + bio_endio(epd->bio); + epd->bio = NULL; + } +} +/* + * Submit bio from extent page data via submit_one_bio + * + * Return 0 if everything is OK. + * Return <0 for error. + */ +static int __must_check flush_write_bio(struct extent_page_data *epd) +{ + int ret = 0; + + if (epd->bio) { ret = submit_one_bio(epd->bio, 0, 0); - BUG_ON(ret < 0); /* -ENOMEM */ + /* + * Clean up of epd->bio is handled by its endio function. + * And endio is either triggered by successful bio execution + * or the error handler of submit bio hook. + * So at this point, no matter what happened, we don't need + * to clean up epd->bio. + */ epd->bio = NULL; } + return ret; } int __init extent_io_init(void) @@ -232,14 +247,46 @@ void __cold extent_io_exit(void) bioset_exit(&btrfs_bioset); } -void extent_io_tree_init(struct extent_io_tree *tree, +void extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner, void *private_data) { + tree->fs_info = fs_info; tree->state = RB_ROOT; tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); tree->private_data = private_data; + tree->owner = owner; +} + +void extent_io_tree_release(struct extent_io_tree *tree) +{ + spin_lock(&tree->lock); + /* + * Do a single barrier for the waitqueue_active check here, the state + * of the waitqueue should not change once extent_io_tree_release is + * called. + */ + smp_mb(); + while (!RB_EMPTY_ROOT(&tree->state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(&tree->state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + /* + * btree io trees aren't supposed to have tasks waiting for + * changes in the flags of extent states ever. + */ + ASSERT(!waitqueue_active(&state->wq)); + free_extent_state(state); + + cond_resched_lock(&tree->lock); + } + spin_unlock(&tree->lock); } static struct extent_state *alloc_extent_state(gfp_t mask) @@ -400,7 +447,7 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *other; struct rb_node *other_node; - if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) return; other_node = rb_prev(&state->rb_node); @@ -611,6 +658,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int clear = 0; btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; @@ -618,7 +666,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (delete) bits |= ~EXTENT_CTLBITS; - if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) clear = 1; again: if (!prealloc && gfpflags_allow_blocking(mask)) { @@ -850,7 +898,7 @@ static void cache_state(struct extent_state *state, struct extent_state **cached_ptr) { return cache_state_if_flags(state, cached_ptr, - EXTENT_IOBITS | EXTENT_BOUNDARY); + EXTENT_LOCKED | EXTENT_BOUNDARY); } /* @@ -880,6 +928,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_end; btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); again: if (!prealloc && gfpflags_allow_blocking(mask)) { @@ -1112,6 +1161,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, bool first_iteration = true; btrfs_debug_check_extent_io_range(tree, start, end); + trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, + clear_bits); again: if (!prealloc) { @@ -1311,6 +1362,13 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, changeset); } +int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits) +{ + return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, + GFP_NOWAIT, NULL); +} + int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached) @@ -1478,6 +1536,79 @@ out: return ret; } +/** + * find_first_clear_extent_bit - finds the first range that has @bits not set + * and that starts after @start + * + * @tree - the tree to search + * @start - the offset at/after which the found extent should start + * @start_ret - records the beginning of the range + * @end_ret - records the end of the range (inclusive) + * @bits - the set of bits which must be unset + * + * Since unallocated range is also considered one which doesn't have the bits + * set it's possible that @end_ret contains -1, this happens in case the range + * spans (last_range_end, end of device]. In this case it's up to the caller to + * trim @end_ret to the appropriate size. + */ +void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits) +{ + struct extent_state *state; + struct rb_node *node, *prev = NULL, *next; + + spin_lock(&tree->lock); + + /* Find first extent with bits cleared */ + while (1) { + node = __etree_search(tree, start, &next, &prev, NULL, NULL); + if (!node) { + node = next; + if (!node) { + /* + * We are past the last allocated chunk, + * set start at the end of the last extent. The + * device alloc tree should never be empty so + * prev is always set. + */ + ASSERT(prev); + state = rb_entry(prev, struct extent_state, rb_node); + *start_ret = state->end + 1; + *end_ret = -1; + goto out; + } + } + state = rb_entry(node, struct extent_state, rb_node); + if (in_range(start, state->start, state->end - state->start + 1) && + (state->state & bits)) { + start = state->end + 1; + } else { + *start_ret = start; + break; + } + } + + /* + * Find the longest stretch from start until an entry which has the + * bits set + */ + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && !(state->state & bits)) { + *end_ret = state->end; + } else { + *end_ret = state->start - 1; + break; + } + + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); +} + /* * find a contiguous range of bytes in the file marked as delalloc, not * more than 'max_bytes'. start and end are used to return the range, @@ -2061,9 +2192,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, return 0; } -int repair_eb_io_failure(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int mirror_num) +int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num) { + struct btrfs_fs_info *fs_info = eb->fs_info; u64 start = eb->start; int i, num_pages = num_extent_pages(eb); int ret = 0; @@ -2409,7 +2540,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, read_mode, failrec->this_mirror, failrec->in_validation); status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror, - failrec->bio_flags, 0); + failrec->bio_flags); if (status) { free_io_failure(failure_tree, tree, failrec); bio_put(bio); @@ -2451,11 +2582,10 @@ static void end_bio_extent_writepage(struct bio *bio) struct bio_vec *bvec; u64 start; u64 end; - int i; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2523,11 +2653,10 @@ static void end_bio_extent_readpage(struct bio *bio) u64 extent_len = 0; int mirror; int ret; - int i; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2607,8 +2736,6 @@ static void end_bio_extent_readpage(struct bio *bio) if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(eb, -EIO); - - ret = -EIO; } readpage_ok: if (likely(uptodate)) { @@ -3069,7 +3196,7 @@ out: return ret; } -static inline void __do_contiguous_readpages(struct extent_io_tree *tree, +static inline void contiguous_readpages(struct extent_io_tree *tree, struct page *pages[], int nr_pages, u64 start, u64 end, struct extent_map **em_cached, @@ -3100,46 +3227,6 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, } } -static void __extent_readpages(struct extent_io_tree *tree, - struct page *pages[], - int nr_pages, - struct extent_map **em_cached, - struct bio **bio, unsigned long *bio_flags, - u64 *prev_em_start) -{ - u64 start = 0; - u64 end = 0; - u64 page_start; - int index; - int first_index = 0; - - for (index = 0; index < nr_pages; index++) { - page_start = page_offset(pages[index]); - if (!end) { - start = page_start; - end = start + PAGE_SIZE - 1; - first_index = index; - } else if (end + 1 == page_start) { - end += PAGE_SIZE; - } else { - __do_contiguous_readpages(tree, &pages[first_index], - index - first_index, start, - end, em_cached, - bio, bio_flags, - prev_em_start); - start = page_start; - end = start + PAGE_SIZE - 1; - first_index = index; - } - } - - if (end) - __do_contiguous_readpages(tree, &pages[first_index], - index - first_index, start, - end, em_cached, bio, - bio_flags, prev_em_start); -} - static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, @@ -3419,6 +3506,9 @@ done: * records are inserted to lock ranges in the tree, and as dirty areas * are found, they are marked writeback. Then the lock bits are removed * and the end_io handler clears the writeback ranges + * + * Return 0 if everything goes well. + * Return <0 for error. */ static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd) @@ -3488,6 +3578,7 @@ done: end_extent_writepage(page, ret, start, page_end); } unlock_page(page); + ASSERT(ret <= 0); return ret; done_unlocked: @@ -3500,18 +3591,26 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } -static noinline_for_stack int -lock_extent_buffer_for_io(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, +/* + * Lock eb pages and flush the bio if we can't the locks + * + * Return 0 if nothing went wrong + * Return >0 is same as 0, except bio is not submitted + * Return <0 if something went wrong, no page is locked + */ +static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, struct extent_page_data *epd) { - int i, num_pages; + struct btrfs_fs_info *fs_info = eb->fs_info; + int i, num_pages, failed_page_nr; int flush = 0; int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { + ret = flush_write_bio(epd); + if (ret < 0) + return ret; flush = 1; - flush_write_bio(epd); btrfs_tree_lock(eb); } @@ -3520,7 +3619,9 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, if (!epd->sync_io) return 0; if (!flush) { - flush_write_bio(epd); + ret = flush_write_bio(epd); + if (ret < 0) + return ret; flush = 1; } while (1) { @@ -3561,7 +3662,11 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, if (!trylock_page(p)) { if (!flush) { - flush_write_bio(epd); + ret = flush_write_bio(epd); + if (ret < 0) { + failed_page_nr = i; + goto err_unlock; + } flush = 1; } lock_page(p); @@ -3569,6 +3674,11 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, } return ret; +err_unlock: + /* Unlock already locked pages */ + for (i = 0; i < failed_page_nr; i++) + unlock_page(eb->pages[i]); + return ret; } static void end_extent_buffer_writeback(struct extent_buffer *eb) @@ -3643,11 +3753,11 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) { struct bio_vec *bvec; struct extent_buffer *eb; - int i, done; + int done; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; eb = (struct extent_buffer *)page->private; @@ -3672,10 +3782,10 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) } static noinline_for_stack int write_one_eb(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, struct writeback_control *wbc, struct extent_page_data *epd) { + struct btrfs_fs_info *fs_info = eb->fs_info; struct block_device *bdev = fs_info->fs_devices->latest_bdev; struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; u64 offset = eb->start; @@ -3701,7 +3811,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(nritems); - end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb); + end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); memzero_extent_buffer(eb, start, end - start); } @@ -3744,7 +3854,6 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; - struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; struct extent_buffer *eb, *prev_eb = NULL; struct extent_page_data epd = { .bio = NULL, @@ -3819,13 +3928,13 @@ retry: continue; prev_eb = eb; - ret = lock_extent_buffer_for_io(eb, fs_info, &epd); + ret = lock_extent_buffer_for_io(eb, &epd); if (!ret) { free_extent_buffer(eb); continue; } - ret = write_one_eb(eb, fs_info, wbc, &epd); + ret = write_one_eb(eb, wbc, &epd); if (ret) { done = 1; free_extent_buffer(eb); @@ -3852,7 +3961,12 @@ retry: index = 0; goto retry; } - flush_write_bio(&epd); + ASSERT(ret <= 0); + if (ret < 0) { + end_write_bio(&epd, ret); + return ret; + } + ret = flush_write_bio(&epd); return ret; } @@ -3949,7 +4063,8 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - flush_write_bio(epd); + ret = flush_write_bio(epd); + BUG_ON(ret < 0); lock_page(page); } @@ -3959,8 +4074,10 @@ retry: } if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) - flush_write_bio(epd); + if (PageWriteback(page)) { + ret = flush_write_bio(epd); + BUG_ON(ret < 0); + } wait_on_page_writeback(page); } @@ -3971,11 +4088,6 @@ retry: } ret = __extent_writepage(page, wbc, epd); - - if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { - unlock_page(page); - ret = 0; - } if (ret < 0) { /* * done_index is set past this page, @@ -4029,8 +4141,14 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) }; ret = __extent_writepage(page, wbc, &epd); + ASSERT(ret <= 0); + if (ret < 0) { + end_write_bio(&epd, ret); + return ret; + } - flush_write_bio(&epd); + ret = flush_write_bio(&epd); + ASSERT(ret <= 0); return ret; } @@ -4070,7 +4188,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, start += PAGE_SIZE; } - flush_write_bio(&epd); + ASSERT(ret <= 0); + if (ret < 0) { + end_write_bio(&epd, ret); + return ret; + } + ret = flush_write_bio(&epd); return ret; } @@ -4086,7 +4209,12 @@ int extent_writepages(struct address_space *mapping, }; ret = extent_write_cache_pages(mapping, wbc, &epd); - flush_write_bio(&epd); + ASSERT(ret <= 0); + if (ret < 0) { + end_write_bio(&epd, ret); + return ret; + } + ret = flush_write_bio(&epd); return ret; } @@ -4102,6 +4230,8 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, u64 prev_em_start = (u64)-1; while (!list_empty(pages)) { + u64 contig_end = 0; + for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { struct page *page = lru_to_page(pages); @@ -4110,14 +4240,22 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, if (add_to_page_cache_lru(page, mapping, page->index, readahead_gfp_mask(mapping))) { put_page(page); - continue; + break; } pagepool[nr++] = page; + contig_end = page_offset(page) + PAGE_SIZE - 1; } - __extent_readpages(tree, pagepool, nr, &em_cached, &bio, - &bio_flags, &prev_em_start); + if (nr) { + u64 contig_start = page_offset(pagepool[0]); + + ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); + + contiguous_readpages(tree, pagepool, nr, contig_start, + contig_end, &em_cached, &bio, &bio_flags, + &prev_em_start); + } } if (em_cached) @@ -4166,10 +4304,9 @@ static int try_release_extent_state(struct extent_io_tree *tree, u64 end = start + PAGE_SIZE - 1; int ret = 1; - if (test_range_bit(tree, start, end, - EXTENT_IOBITS, 0, NULL)) + if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { ret = 0; - else { + } else { /* * at this point we can safely clear everything except the * locked bit and the nodatasum bit @@ -4222,8 +4359,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) } if (!test_range_bit(tree, em->start, extent_map_end(em) - 1, - EXTENT_LOCKED | EXTENT_WRITEBACK, - 0, NULL)) { + EXTENT_LOCKED, 0, NULL)) { set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &btrfs_inode->runtime_flags); remove_extent_mapping(map, em); @@ -4372,8 +4508,7 @@ try_submit_last: * In this case, the first extent range will be cached but not emitted. * So we must emit it before ending extent_fiemap(). */ -static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info, - struct fiemap_extent_info *fieinfo, +static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache) { int ret; @@ -4580,7 +4715,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } out_free: if (!ret) - ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache); + ret = emit_last_fiemap_cache(fieinfo, &cache); free_extent_map(em); out: btrfs_free_path(path); @@ -4672,13 +4807,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->fs_info = fs_info; eb->bflags = 0; rwlock_init(&eb->lock); - atomic_set(&eb->write_locks, 0); - atomic_set(&eb->read_locks, 0); atomic_set(&eb->blocking_readers, 0); atomic_set(&eb->blocking_writers, 0); - atomic_set(&eb->spinning_readers, 0); - atomic_set(&eb->spinning_writers, 0); - eb->lock_nested = 0; + eb->lock_nested = false; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); @@ -4695,6 +4826,13 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, > MAX_INLINE_EXTENT_BUFFER_SIZE); BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); +#ifdef CONFIG_BTRFS_DEBUG + atomic_set(&eb->spinning_writers, 0); + atomic_set(&eb->spinning_readers, 0); + atomic_set(&eb->read_locks, 0); + atomic_set(&eb->write_locks, 0); +#endif + return eb; } @@ -5183,8 +5321,7 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } -int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, int wait, int mirror_num) +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) { int i; struct page *page; @@ -5196,6 +5333,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, unsigned long num_reads = 0; struct bio *bio = NULL; unsigned long bio_flags = 0; + struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -5746,13 +5884,13 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, btrfs_err(fs_info, "memmove bogus src_offset %lu move len %lu dst len %lu", src_offset, len, dst->len); - BUG_ON(1); + BUG(); } if (dst_offset + len > dst->len) { btrfs_err(fs_info, "memmove bogus dst_offset %lu move len %lu dst len %lu", dst_offset, len, dst->len); - BUG_ON(1); + BUG(); } while (len > 0) { @@ -5793,13 +5931,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, btrfs_err(fs_info, "memmove bogus src_offset %lu move len %lu len %lu", src_offset, len, dst->len); - BUG_ON(1); + BUG(); } if (dst_offset + len > dst->len) { btrfs_err(fs_info, "memmove bogus dst_offset %lu move len %lu len %lu", dst_offset, len, dst->len); - BUG_ON(1); + BUG(); } if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 08749e0b9c32..aa18a16a6ed7 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -9,27 +9,34 @@ /* bits for the extent state */ #define EXTENT_DIRTY (1U << 0) -#define EXTENT_WRITEBACK (1U << 1) -#define EXTENT_UPTODATE (1U << 2) -#define EXTENT_LOCKED (1U << 3) -#define EXTENT_NEW (1U << 4) -#define EXTENT_DELALLOC (1U << 5) -#define EXTENT_DEFRAG (1U << 6) -#define EXTENT_BOUNDARY (1U << 9) -#define EXTENT_NODATASUM (1U << 10) -#define EXTENT_CLEAR_META_RESV (1U << 11) -#define EXTENT_NEED_WAIT (1U << 12) -#define EXTENT_DAMAGED (1U << 13) -#define EXTENT_NORESERVE (1U << 14) -#define EXTENT_QGROUP_RESERVED (1U << 15) -#define EXTENT_CLEAR_DATA_RESV (1U << 16) -#define EXTENT_DELALLOC_NEW (1U << 17) -#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_UPTODATE (1U << 1) +#define EXTENT_LOCKED (1U << 2) +#define EXTENT_NEW (1U << 3) +#define EXTENT_DELALLOC (1U << 4) +#define EXTENT_DEFRAG (1U << 5) +#define EXTENT_BOUNDARY (1U << 6) +#define EXTENT_NODATASUM (1U << 7) +#define EXTENT_CLEAR_META_RESV (1U << 8) +#define EXTENT_NEED_WAIT (1U << 9) +#define EXTENT_DAMAGED (1U << 10) +#define EXTENT_NORESERVE (1U << 11) +#define EXTENT_QGROUP_RESERVED (1U << 12) +#define EXTENT_CLEAR_DATA_RESV (1U << 13) +#define EXTENT_DELALLOC_NEW (1U << 14) #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING) /* + * Redefined bits above which are used only in the device allocation tree, + * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV + * / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit + * manipulation functions + */ +#define CHUNK_ALLOCATED EXTENT_DIRTY +#define CHUNK_TRIMMED EXTENT_DEFRAG + +/* * flags for bio submission. The high bits indicate the compression * type for this bio */ @@ -88,9 +95,6 @@ struct btrfs_inode; struct btrfs_io_bio; struct io_failure_record; -typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset); typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct bio *bio, u64 bio_offset); @@ -100,17 +104,34 @@ struct extent_io_ops { * The following callbacks must be always defined, the function * pointer will be called unconditionally. */ - extent_submit_bio_hook_t *submit_bio_hook; + blk_status_t (*submit_bio_hook)(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror); }; +enum { + IO_TREE_FS_INFO_FREED_EXTENTS0, + IO_TREE_FS_INFO_FREED_EXTENTS1, + IO_TREE_INODE_IO, + IO_TREE_INODE_IO_FAILURE, + IO_TREE_RELOC_BLOCKS, + IO_TREE_TRANS_DIRTY_PAGES, + IO_TREE_ROOT_DIRTY_LOG_PAGES, + IO_TREE_SELFTEST, +}; + struct extent_io_tree { struct rb_root state; + struct btrfs_fs_info *fs_info; void *private_data; u64 dirty_bytes; - int track_uptodate; + bool track_uptodate; + + /* Who owns this io tree, should be one of IO_TREE_* */ + u8 owner; + spinlock_t lock; const struct extent_io_ops *ops; }; @@ -146,14 +167,9 @@ struct extent_buffer { struct rcu_head rcu_head; pid_t lock_owner; - /* count of read lock holders on the extent buffer */ - atomic_t write_locks; - atomic_t read_locks; atomic_t blocking_writers; atomic_t blocking_readers; - atomic_t spinning_readers; - atomic_t spinning_writers; - short lock_nested; + bool lock_nested; /* >= 0 if eb belongs to a log tree, -1 otherwise */ short log_index; @@ -171,6 +187,10 @@ struct extent_buffer { wait_queue_head_t read_lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG + atomic_t spinning_writers; + atomic_t spinning_readers; + atomic_t read_locks; + atomic_t write_locks; struct list_head leak_list; #endif }; @@ -239,7 +259,10 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, u64 start, u64 len, int create); -void extent_io_tree_init(struct extent_io_tree *tree, void *private_data); +void extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner, + void *private_data); +void extent_io_tree_release(struct extent_io_tree *tree); int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -309,6 +332,8 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); +int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits); static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits) @@ -376,6 +401,8 @@ static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, unsigned bits, struct extent_state **cached_state); +void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); int extent_write_full_page(struct page *page, struct writeback_control *wbc); @@ -405,8 +432,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 #define WAIT_COMPLETE 1 #define WAIT_PAGE_LOCK 2 -int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, int wait, +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); @@ -487,8 +513,7 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, struct extent_io_tree *io_tree, u64 start, struct page *page, u64 ino, unsigned int pg_offset); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -int repair_eb_io_failure(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int mirror_num); +int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num); /* * When IO fails, either with EIO or csum verification fails, we diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 928f729c55ba..9558d79faf1e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -4,6 +4,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include "ctree.h" +#include "volumes.h" #include "extent_map.h" #include "compression.h" @@ -337,6 +338,37 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree, try_merge_map(tree, em); } +static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) +{ + struct map_lookup *map = em->map_lookup; + u64 stripe_size = em->orig_block_len; + int i; + + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_bio_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + set_extent_bits_nowait(&device->alloc_state, stripe->physical, + stripe->physical + stripe_size - 1, bits); + } +} + +static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) +{ + struct map_lookup *map = em->map_lookup; + u64 stripe_size = em->orig_block_len; + int i; + + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_bio_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + __clear_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + stripe_size - 1, bits, + 0, 0, NULL, GFP_NOWAIT, NULL); + } +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in @@ -357,6 +389,10 @@ int add_extent_mapping(struct extent_map_tree *tree, goto out; setup_extent_mapping(tree, em, modified); + if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) { + extent_map_device_set_bits(em, CHUNK_ALLOCATED); + extent_map_device_clear_bits(em, CHUNK_TRIMMED); + } out: return ret; } @@ -438,6 +474,8 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) rb_erase_cached(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) list_del_init(&em->list); + if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) + extent_map_device_clear_bits(em, CHUNK_ALLOCATED); RB_CLEAR_NODE(&em->rb_node); } diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 920bf3b4b0ef..d431ea8198e4 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/highmem.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -412,6 +413,16 @@ fail: return ret; } +/* + * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio + * @inode: Owner of the data inside the bio + * @bio: Contains the data to be checksummed + * @file_start: offset in file this bio begins to describe + * @contig: Boolean. If true/1 means all bio vecs in this bio are + * contiguous and they begin at @file_start in the file. False/0 + * means this bio can contains potentially discontigous bio vecs + * so the logical offset of each should be calculated separately. + */ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig) { @@ -427,9 +438,13 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, unsigned long this_sum_bytes = 0; int i; u64 offset; + unsigned nofs_flag; + + nofs_flag = memalloc_nofs_save(); + sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), + GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); - sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), - GFP_NOFS); if (!sums) return BLK_STS_RESOURCE; @@ -453,8 +468,6 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, BUG_ON(!ordered); /* Logic error */ } - data = kmap_atomic(bvec.bv_page); - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len + fs_info->sectorsize - 1); @@ -464,16 +477,17 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, offset < ordered->file_offset) { unsigned long bytes_left; - kunmap_atomic(data); sums->len = this_sum_bytes; this_sum_bytes = 0; - btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_add_ordered_sum(ordered, sums); btrfs_put_ordered_extent(ordered); bytes_left = bio->bi_iter.bi_size - total_bytes; - sums = kzalloc(btrfs_ordered_sum_size(fs_info, bytes_left), - GFP_NOFS); + nofs_flag = memalloc_nofs_save(); + sums = kvzalloc(btrfs_ordered_sum_size(fs_info, + bytes_left), GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); BUG_ON(!sums); /* -ENOMEM */ sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, @@ -482,16 +496,16 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + total_bytes; index = 0; - - data = kmap_atomic(bvec.bv_page); } sums->sums[index] = ~(u32)0; + data = kmap_atomic(bvec.bv_page); sums->sums[index] = btrfs_csum_data(data + bvec.bv_offset + (i * fs_info->sectorsize), sums->sums[index], fs_info->sectorsize); + kunmap_atomic(data); btrfs_csum_final(sums->sums[index], (char *)(sums->sums + index)); index++; @@ -500,10 +514,9 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, total_bytes += fs_info->sectorsize; } - kunmap_atomic(data); } this_sum_bytes = 0; - btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_add_ordered_sum(ordered, sums); btrfs_put_ordered_extent(ordered); return 0; } @@ -544,7 +557,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, */ u32 new_size = (bytenr - key->offset) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(fs_info, path, new_size, 1); + btrfs_truncate_item(path, new_size, 1); } else if (key->offset >= bytenr && csum_end > end_byte && end_byte > key->offset) { /* @@ -556,7 +569,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, u32 new_size = (csum_end - end_byte) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(fs_info, path, new_size, 0); + btrfs_truncate_item(path, new_size, 0); key->offset = end_byte; btrfs_set_item_key_safe(fs_info, path, key); @@ -825,11 +838,11 @@ again: u32 diff; u32 free_space; - if (btrfs_leaf_free_space(fs_info, leaf) < + if (btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item) + csum_size * 2) goto insert; - free_space = btrfs_leaf_free_space(fs_info, leaf) - + free_space = btrfs_leaf_free_space(leaf) - sizeof(struct btrfs_item) - csum_size; tmp = sums->len - total_bytes; tmp >>= fs_info->sb->s_blocksize_bits; @@ -845,7 +858,7 @@ again: diff /= csum_size; diff *= csum_size; - btrfs_extend_item(fs_info, path, diff); + btrfs_extend_item(path, diff); ret = 0; goto csum; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 34fe8a58b0e9..7e85dca0e6f2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -754,6 +754,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; + struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; u64 ino = btrfs_ino(BTRFS_I(inode)); @@ -909,11 +910,14 @@ next_slot: btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) { - ret = btrfs_inc_extent_ref(trans, root, - disk_bytenr, num_bytes, 0, + btrfs_init_generic_ref(&ref, + BTRFS_ADD_DELAYED_REF, + disk_bytenr, num_bytes, 0); + btrfs_init_data_ref(&ref, root->root_key.objectid, new_key.objectid, start - extent_offset); + ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -993,11 +997,14 @@ delete_extent_item: extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { - ret = btrfs_free_extent(trans, root, - disk_bytenr, num_bytes, 0, + btrfs_init_generic_ref(&ref, + BTRFS_DROP_DELAYED_REF, + disk_bytenr, num_bytes, 0); + btrfs_init_data_ref(&ref, root->root_key.objectid, - key.objectid, key.offset - - extent_offset); + key.objectid, + key.offset - extent_offset); + ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); @@ -1025,7 +1032,7 @@ delete_extent_item: continue; } - BUG_ON(1); + BUG(); } if (!ret && del_nr > 0) { @@ -1050,7 +1057,7 @@ delete_extent_item: if (!ret && replace_extent && leafs_visited == 1 && (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || path->locks[0] == BTRFS_WRITE_LOCK) && - btrfs_leaf_free_space(fs_info, leaf) >= + btrfs_leaf_free_space(leaf) >= sizeof(struct btrfs_item) + extent_item_size) { key.objectid = ino; @@ -1142,6 +1149,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_file_extent_item *fi; + struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; u64 bytenr; @@ -1287,9 +1295,11 @@ again: extent_end - split); btrfs_mark_buffer_dirty(leaf); - ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - ino, orig_offset); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, + num_bytes, 0); + btrfs_init_data_ref(&ref, root->root_key.objectid, ino, + orig_offset); + ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -1311,6 +1321,9 @@ again: other_start = end; other_end = 0; + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, + num_bytes, 0); + btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset); if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { @@ -1321,9 +1334,7 @@ again: extent_end = other_end; del_slot = path->slots[0] + 1; del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - ino, orig_offset); + ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -1341,9 +1352,7 @@ again: key.offset = other_start; del_slot = path->slots[0]; del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - ino, orig_offset); + ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -2165,7 +2174,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) inode_unlock(inode); goto out; } - trans->sync = true; ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx); if (ret < 0) { @@ -3132,6 +3140,7 @@ static long btrfs_fallocate(struct file *file, int mode, ret = btrfs_qgroup_reserve_data(inode, &data_reserved, cur_offset, last_byte - cur_offset); if (ret < 0) { + cur_offset = last_byte; free_extent_map(em); break; } @@ -3181,7 +3190,7 @@ out: /* Let go of our reservation. */ if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) btrfs_free_reserved_data_space(inode, data_reserved, - alloc_start, alloc_end - cur_offset); + cur_offset, alloc_end - cur_offset); extent_changeset_free(data_reserved); return ret; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 74aa552f4793..f74dc259307b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -88,10 +88,11 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, return inode; } -struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache - *block_group, struct btrfs_path *path) +struct inode *lookup_free_space_inode( + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct inode *inode = NULL; u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; @@ -185,20 +186,19 @@ static int __create_free_space_inode(struct btrfs_root *root, return 0; } -int create_free_space_inode(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int create_free_space_inode(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { int ret; u64 ino; - ret = btrfs_find_free_objectid(fs_info->tree_root, &ino); + ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino); if (ret < 0) return ret; - return __create_free_space_inode(fs_info->tree_root, trans, path, ino, - block_group->key.objectid); + return __create_free_space_inode(trans->fs_info->tree_root, trans, path, + ino, block_group->key.objectid); } int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, @@ -812,9 +812,9 @@ free_cache: goto out; } -int load_free_space_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group) +int load_free_space_cache(struct btrfs_block_group_cache *block_group) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; struct btrfs_path *path; @@ -858,7 +858,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, * once created get their ->cached field set to BTRFS_CACHE_FINISHED so * we will never try to read their inode item while the fs is mounted. */ - inode = lookup_free_space_inode(fs_info, block_group, path); + inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode)) { btrfs_free_path(path); return 0; @@ -1039,8 +1039,7 @@ fail: return -1; } -static noinline_for_stack int -write_pinned_extent_entries(struct btrfs_fs_info *fs_info, +static noinline_for_stack int write_pinned_extent_entries( struct btrfs_block_group_cache *block_group, struct btrfs_io_ctl *io_ctl, int *entries) @@ -1059,7 +1058,7 @@ write_pinned_extent_entries(struct btrfs_fs_info *fs_info, * We shouldn't have switched the pinned extents yet so this is the * right one */ - unpin = fs_info->pinned_extents; + unpin = block_group->fs_info->pinned_extents; start = block_group->key.objectid; @@ -1235,7 +1234,6 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_io_ctl *io_ctl, struct btrfs_trans_handle *trans) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_state *cached_state = NULL; LIST_HEAD(bitmap_list); int entries = 0; @@ -1293,8 +1291,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * If this changes while we are working we'll get added back to * the dirty list and redo it. No locking needed */ - ret = write_pinned_extent_entries(fs_info, block_group, - io_ctl, &entries); + ret = write_pinned_extent_entries(block_group, io_ctl, &entries); if (ret) goto out_nospc_locked; @@ -1370,11 +1367,11 @@ out_unlock: goto out; } -int btrfs_write_out_cache(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_write_out_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; @@ -1386,7 +1383,7 @@ int btrfs_write_out_cache(struct btrfs_fs_info *fs_info, } spin_unlock(&block_group->lock); - inode = lookup_free_space_inode(fs_info, block_group, path); + inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode)) return 0; @@ -3040,11 +3037,11 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, * returns zero and sets up cluster if things worked out, otherwise * it returns -enospc */ -int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, +int btrfs_find_space_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry, *tmp; LIST_HEAD(bitmaps); @@ -3366,10 +3363,6 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) em = lookup_extent_mapping(em_tree, block_group->key.objectid, 1); BUG_ON(!em); /* logic error, can't happen */ - /* - * remove_extent_mapping() will delete us from the pinned_chunks - * list, which is protected by the chunk mutex. - */ remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); mutex_unlock(&fs_info->chunk_mutex); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 15e30b93db0d..8760acb55ffd 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -38,11 +38,10 @@ struct btrfs_free_space_op { struct btrfs_io_ctl; -struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache - *block_group, struct btrfs_path *path); -int create_free_space_inode(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +struct inode *lookup_free_space_inode( + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); +int create_free_space_inode(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); @@ -51,13 +50,11 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct inode *inode); -int load_free_space_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group); +int load_free_space_cache(struct btrfs_block_group_cache *block_group); int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); -int btrfs_write_out_cache(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, +int btrfs_write_out_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); struct inode *lookup_free_ino_inode(struct btrfs_root *root, @@ -95,8 +92,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes); -int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, +int btrfs_find_space_cluster(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size); void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index e5089087eaa6..f5dc115ebba0 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -76,10 +76,11 @@ out: EXPORT_FOR_TESTS struct btrfs_free_space_info *search_free_space_info( - struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, int cow) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->free_space_root; struct btrfs_key key; int ret; @@ -253,7 +254,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, btrfs_release_path(path); } - info = search_free_space_info(trans, fs_info, block_group, path, 1); + info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; @@ -398,7 +399,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_release_path(path); } - info = search_free_space_info(trans, fs_info, block_group, path, 1); + info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; @@ -463,8 +464,7 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, if (new_extents == 0) return 0; - info = search_free_space_info(trans, trans->fs_info, block_group, path, - 1); + info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; @@ -793,8 +793,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, return ret; } - info = search_free_space_info(NULL, trans->fs_info, block_group, path, - 0); + info = search_free_space_info(NULL, block_group, path, 0); if (IS_ERR(info)) return PTR_ERR(info); flags = btrfs_free_space_flags(path->nodes[0], info); @@ -977,7 +976,6 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_free_space_info *info; u32 flags; int ret; @@ -988,7 +986,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans, return ret; } - info = search_free_space_info(NULL, fs_info, block_group, path, 0); + info = search_free_space_info(NULL, block_group, path, 0); if (IS_ERR(info)) return PTR_ERR(info); flags = btrfs_free_space_flags(path->nodes[0], info); @@ -1150,7 +1148,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) return PTR_ERR(trans); set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); - free_space_root = btrfs_create_tree(trans, fs_info, + free_space_root = btrfs_create_tree(trans, BTRFS_FREE_SPACE_TREE_OBJECTID); if (IS_ERR(free_space_root)) { ret = PTR_ERR(free_space_root); @@ -1248,7 +1246,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) list_del(&free_space_root->dirty_list); btrfs_tree_lock(free_space_root->node); - clean_tree_block(fs_info, free_space_root->node); + btrfs_clean_tree_block(free_space_root->node); btrfs_tree_unlock(free_space_root->node); btrfs_free_tree_block(trans, free_space_root, free_space_root->node, 0, 1); @@ -1534,14 +1532,12 @@ out: int load_free_space_tree(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group_cache *block_group; - struct btrfs_fs_info *fs_info; struct btrfs_free_space_info *info; struct btrfs_path *path; u32 extent_count, flags; int ret; block_group = caching_ctl->block_group; - fs_info = block_group->fs_info; path = btrfs_alloc_path(); if (!path) @@ -1555,7 +1551,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) path->search_commit_root = 1; path->reada = READA_FORWARD; - info = search_free_space_info(NULL, fs_info, block_group, path, 0); + info = search_free_space_info(NULL, block_group, path, 0); if (IS_ERR(info)) { ret = PTR_ERR(info); goto out; diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 3133651d7d70..22b7602bde25 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -30,7 +30,6 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_free_space_info * search_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, int cow); int __add_to_free_space_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index a8956a3c9e05..30d62ef918b9 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -170,7 +170,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + del_len, item_size - (ptr + del_len - item_start)); - btrfs_truncate_item(root->fs_info, path, item_size - del_len, 1); + btrfs_truncate_item(path, item_size - del_len, 1); out: btrfs_free_path(path); @@ -234,7 +234,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - btrfs_truncate_item(root->fs_info, path, item_size - sub_item_len, 1); + btrfs_truncate_item(path, item_size - sub_item_len, 1); out: btrfs_free_path(path); @@ -288,7 +288,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, name, name_len, NULL)) goto out; - btrfs_extend_item(root->fs_info, path, ins_len); + btrfs_extend_item(path, ins_len); ret = 0; } if (ret < 0) @@ -347,7 +347,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - btrfs_extend_item(fs_info, path, ins_len); + btrfs_extend_item(path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 82fdda8ff5ab..9aba9660efe5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -28,6 +28,7 @@ #include <linux/magic.h> #include <linux/iversion.h> #include <linux/swap.h> +#include <linux/sched/mm.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" @@ -73,17 +74,6 @@ struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; -#define S_SHIFT 12 -static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, - [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, - [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, -}; - static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); @@ -366,18 +356,24 @@ struct async_extent { struct list_head list; }; -struct async_cow { +struct async_chunk { struct inode *inode; - struct btrfs_fs_info *fs_info; struct page *locked_page; u64 start; u64 end; unsigned int write_flags; struct list_head extents; struct btrfs_work work; + atomic_t *pending; +}; + +struct async_cow { + /* Number of chunks in flight; must be first in the structure */ + atomic_t num_chunks; + struct async_chunk chunks[]; }; -static noinline int add_async_extent(struct async_cow *cow, +static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, struct page **pages, @@ -444,14 +440,14 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, * are written in the same order that the flusher thread sent them * down. */ -static noinline void compress_file_range(struct inode *inode, - struct page *locked_page, - u64 start, u64 end, - struct async_cow *async_cow, - int *num_added) +static noinline void compress_file_range(struct async_chunk *async_chunk, + int *num_added) { + struct inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 blocksize = fs_info->sectorsize; + u64 start = async_chunk->start; + u64 end = async_chunk->end; u64 actual_end; int ret = 0; struct page **pages = NULL; @@ -630,7 +626,7 @@ cont: * allocation on disk for these compressed pages, and * will submit them to the elevator. */ - add_async_extent(async_cow, start, total_in, + add_async_extent(async_chunk, start, total_in, total_compressed, pages, nr_pages, compress_type); @@ -670,14 +666,14 @@ cleanup_and_bail_uncompressed: * to our extent and set things up for the async work queue to run * cow_file_range to do the normal delalloc dance. */ - if (page_offset(locked_page) >= start && - page_offset(locked_page) <= end) - __set_page_dirty_nobuffers(locked_page); + if (page_offset(async_chunk->locked_page) >= start && + page_offset(async_chunk->locked_page) <= end) + __set_page_dirty_nobuffers(async_chunk->locked_page); /* unlocked later on in the async handlers */ if (redirty) extent_range_redirty_for_io(inode, start, end); - add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, + add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; @@ -713,38 +709,34 @@ static void free_async_extent_pages(struct async_extent *async_extent) * queued. We walk all the async extents created by compress_file_range * and send them down to the disk. */ -static noinline void submit_compressed_extents(struct async_cow *async_cow) +static noinline void submit_compressed_extents(struct async_chunk *async_chunk) { - struct inode *inode = async_cow->inode; + struct inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_extent *async_extent; u64 alloc_hint = 0; struct btrfs_key ins; struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; int ret = 0; again: - while (!list_empty(&async_cow->extents)) { - async_extent = list_entry(async_cow->extents.next, + while (!list_empty(&async_chunk->extents)) { + async_extent = list_entry(async_chunk->extents.next, struct async_extent, list); list_del(&async_extent->list); - io_tree = &BTRFS_I(inode)->io_tree; - retry: + lock_extent(io_tree, async_extent->start, + async_extent->start + async_extent->ram_size - 1); /* did the compression code fall back to uncompressed IO? */ if (!async_extent->pages) { int page_started = 0; unsigned long nr_written = 0; - lock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1); - /* allocate blocks */ - ret = cow_file_range(inode, async_cow->locked_page, + ret = cow_file_range(inode, async_chunk->locked_page, async_extent->start, async_extent->start + async_extent->ram_size - 1, @@ -768,15 +760,12 @@ retry: async_extent->ram_size - 1, WB_SYNC_ALL); else if (ret) - unlock_page(async_cow->locked_page); + unlock_page(async_chunk->locked_page); kfree(async_extent); cond_resched(); continue; } - lock_extent(io_tree, async_extent->start, - async_extent->start + async_extent->ram_size - 1); - ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, @@ -855,7 +844,7 @@ retry: ins.objectid, ins.offset, async_extent->pages, async_extent->nr_pages, - async_cow->write_flags)) { + async_chunk->write_flags)) { struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; @@ -1132,16 +1121,15 @@ out_unlock: */ static noinline void async_cow_start(struct btrfs_work *work) { - struct async_cow *async_cow; + struct async_chunk *async_chunk; int num_added = 0; - async_cow = container_of(work, struct async_cow, work); - compress_file_range(async_cow->inode, async_cow->locked_page, - async_cow->start, async_cow->end, async_cow, - &num_added); + async_chunk = container_of(work, struct async_chunk, work); + + compress_file_range(async_chunk, &num_added); if (num_added == 0) { - btrfs_add_delayed_iput(async_cow->inode); - async_cow->inode = NULL; + btrfs_add_delayed_iput(async_chunk->inode); + async_chunk->inode = NULL; } } @@ -1150,14 +1138,12 @@ static noinline void async_cow_start(struct btrfs_work *work) */ static noinline void async_cow_submit(struct btrfs_work *work) { - struct btrfs_fs_info *fs_info; - struct async_cow *async_cow; + struct async_chunk *async_chunk = container_of(work, struct async_chunk, + work); + struct btrfs_fs_info *fs_info = btrfs_work_owner(work); unsigned long nr_pages; - async_cow = container_of(work, struct async_cow, work); - - fs_info = async_cow->fs_info; - nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> + nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; /* atomic_sub_return implies a barrier */ @@ -1166,22 +1152,28 @@ static noinline void async_cow_submit(struct btrfs_work *work) cond_wake_up_nomb(&fs_info->async_submit_wait); /* - * ->inode could be NULL if async_cow_start has failed to compress, + * ->inode could be NULL if async_chunk_start has failed to compress, * in which case we don't have anything to submit, yet we need to * always adjust ->async_delalloc_pages as its paired with the init * happening in cow_file_range_async */ - if (async_cow->inode) - submit_compressed_extents(async_cow); + if (async_chunk->inode) + submit_compressed_extents(async_chunk); } static noinline void async_cow_free(struct btrfs_work *work) { - struct async_cow *async_cow; - async_cow = container_of(work, struct async_cow, work); - if (async_cow->inode) - btrfs_add_delayed_iput(async_cow->inode); - kfree(async_cow); + struct async_chunk *async_chunk; + + async_chunk = container_of(work, struct async_chunk, work); + if (async_chunk->inode) + btrfs_add_delayed_iput(async_chunk->inode); + /* + * Since the pointer to 'pending' is at the beginning of the array of + * async_chunk's, freeing it ensures the whole array has been freed. + */ + if (atomic_dec_and_test(async_chunk->pending)) + kvfree(async_chunk->pending); } static int cow_file_range_async(struct inode *inode, struct page *locked_page, @@ -1190,45 +1182,73 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, unsigned int write_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct async_cow *async_cow; + struct async_cow *ctx; + struct async_chunk *async_chunk; unsigned long nr_pages; u64 cur_end; + u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); + int i; + bool should_compress; + unsigned nofs_flag; + + unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && + !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { + num_chunks = 1; + should_compress = false; + } else { + should_compress = true; + } + + nofs_flag = memalloc_nofs_save(); + ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + + if (!ctx) { + unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING; + unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | + PAGE_SET_ERROR; + + extent_clear_unlock_delalloc(inode, start, end, 0, locked_page, + clear_bits, page_ops); + return -ENOMEM; + } + + async_chunk = ctx->chunks; + atomic_set(&ctx->num_chunks, num_chunks); + + for (i = 0; i < num_chunks; i++) { + if (should_compress) + cur_end = min(end, start + SZ_512K - 1); + else + cur_end = end; - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, - 1, 0, NULL); - while (start < end) { - async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); - BUG_ON(!async_cow); /* -ENOMEM */ /* * igrab is called higher up in the call chain, take only the * lightweight reference for the callback lifetime */ ihold(inode); - async_cow->inode = inode; - async_cow->fs_info = fs_info; - async_cow->locked_page = locked_page; - async_cow->start = start; - async_cow->write_flags = write_flags; - - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && - !btrfs_test_opt(fs_info, FORCE_COMPRESS)) - cur_end = end; - else - cur_end = min(end, start + SZ_512K - 1); - - async_cow->end = cur_end; - INIT_LIST_HEAD(&async_cow->extents); - - btrfs_init_work(&async_cow->work, + async_chunk[i].pending = &ctx->num_chunks; + async_chunk[i].inode = inode; + async_chunk[i].start = start; + async_chunk[i].end = cur_end; + async_chunk[i].locked_page = locked_page; + async_chunk[i].write_flags = write_flags; + INIT_LIST_HEAD(&async_chunk[i].extents); + + btrfs_init_work(&async_chunk[i].work, btrfs_delalloc_helper, async_cow_start, async_cow_submit, async_cow_free); - nr_pages = (cur_end - start + PAGE_SIZE) >> - PAGE_SHIFT; + nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); - btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work); + btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); *nr_written += nr_pages; start = cur_end + 1; @@ -1451,7 +1471,7 @@ next_slot: extent_end = ALIGN(extent_end, fs_info->sectorsize); } else { - BUG_ON(1); + BUG(); } out_check: if (extent_end <= start) { @@ -1964,11 +1984,11 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, * * c-3) otherwise: async submit */ -static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, + int mirror_num, + unsigned long bio_flags) + { - struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; @@ -2003,8 +2023,7 @@ static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, goto mapit; /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags, - bio_offset, inode, - btrfs_submit_bio_start); + 0, inode, btrfs_submit_bio_start); goto out; } else if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, 0, 0); @@ -2531,6 +2550,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path, struct btrfs_file_extent_item *item; struct btrfs_ordered_extent *ordered; struct btrfs_trans_handle *trans; + struct btrfs_ref ref = { 0 }; struct btrfs_root *root; struct btrfs_key key; struct extent_buffer *leaf; @@ -2701,10 +2721,11 @@ again: inode_add_bytes(inode, len); btrfs_release_path(path); - ret = btrfs_inc_extent_ref(trans, root, new->bytenr, - new->disk_len, 0, - backref->root_id, backref->inum, - new->file_pos); /* start - extent_offset */ + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new->bytenr, + new->disk_len, 0); + btrfs_init_data_ref(&ref, backref->root_id, backref->inum, + new->file_pos); /* start - extent_offset */ + ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out_free_path; @@ -3699,21 +3720,6 @@ cache_index: * inode is not a directory, logging its parent unnecessarily. */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; - /* - * Similar reasoning for last_link_trans, needs to be set otherwise - * for a case like the following: - * - * mkdir A - * touch foo - * ln foo A/bar - * echo 2 > /proc/sys/vm/drop_caches - * fsync foo - * <power failure> - * - * Would result in link bar and directory A not existing after the power - * failure. - */ - BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans; path->slots[0]++; if (inode->i_nlink != 1 || @@ -4679,7 +4685,7 @@ search_again: btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(root->fs_info, path, size, 1); + btrfs_truncate_item(path, size, 1); } else if (!del_item) { /* * We have to bail so the last_size is set to @@ -4718,12 +4724,17 @@ delete: if (found_extent && (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root == fs_info->tree_root)) { + struct btrfs_ref ref = { 0 }; + btrfs_set_path_blocking(path); bytes_deleted += extent_num_bytes; - ret = btrfs_free_extent(trans, root, extent_start, - extent_num_bytes, 0, - btrfs_header_owner(leaf), - ino, extent_offset); + + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, + extent_start, extent_num_bytes, 0); + ref.real_root = root->root_key.objectid; + btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), + ino, extent_offset); + ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; @@ -5448,12 +5459,14 @@ no_delete: } /* - * this returns the key found in the dir entry in the location pointer. + * Return the key found in the dir entry in the location pointer, fill @type + * with BTRFS_FT_*, and return 0. + * * If no dir entries were found, returns -ENOENT. * If found a corrupted location in dir entry, returns -EUCLEAN. */ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, - struct btrfs_key *location) + struct btrfs_key *location, u8 *type) { const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; @@ -5482,6 +5495,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, __func__, name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); } + if (!ret) + *type = btrfs_dir_type(path->nodes[0], di); out: btrfs_free_path(path); return ret; @@ -5719,6 +5734,24 @@ static struct inode *new_simple_dir(struct super_block *s, return inode; } +static inline u8 btrfs_inode_type(struct inode *inode) +{ + /* + * Compile-time asserts that generic FT_* types still match + * BTRFS_FT_* types + */ + BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); + BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); + BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); + BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); + BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); + BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); + BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); + BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); + + return fs_umode_to_ftype(inode->i_mode); +} + struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -5726,18 +5759,31 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location; + u8 di_type = 0; int index; int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ret = btrfs_inode_by_name(dir, dentry, &location); + ret = btrfs_inode_by_name(dir, dentry, &location, &di_type); if (ret < 0) return ERR_PTR(ret); if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(dir->i_sb, &location, root, NULL); + if (IS_ERR(inode)) + return inode; + + /* Do extra check against inode mode with di_type */ + if (btrfs_inode_type(inode) != di_type) { + btrfs_crit(fs_info, +"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", + inode->i_mode, btrfs_inode_type(inode), + di_type); + iput(inode); + return ERR_PTR(-EUCLEAN); + } return inode; } @@ -5797,10 +5843,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, return d_splice_alias(inode, dentry); } -unsigned char btrfs_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - /* * All this infrastructure exists because dir_emit can fault, and we are holding * the tree lock when doing readdir. For now just allocate a buffer and copy @@ -5939,7 +5981,7 @@ again: name_ptr = (char *)(entry + 1); read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), name_len); - put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)], + put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)), &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); put_unaligned(location.objectid, &entry->ino); @@ -6342,11 +6384,6 @@ fail: return ERR_PTR(ret); } -static inline u8 btrfs_inode_type(struct inode *inode) -{ - return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; -} - /* * utility function to add 'inode' into 'parent_inode' with * a give name and a given sequence number. @@ -6634,7 +6671,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) goto fail; } - BTRFS_I(inode)->last_link_trans = trans->transid; d_instantiate(dentry, inode); ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, true, NULL); @@ -6783,7 +6819,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); - u8 extent_type; + int extent_type = -1; struct btrfs_path *path = NULL; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *item; @@ -6864,6 +6900,14 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, extent_start = found_key.offset; if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + /* Only regular file could have regular/prealloc extent */ + if (!S_ISREG(inode->vfs_inode.i_mode)) { + ret = -EUCLEAN; + btrfs_crit(fs_info, + "regular/prealloc extent found for non-regular inode %llu", + btrfs_ino(inode)); + goto out; + } extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); @@ -7828,7 +7872,6 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) struct inode *inode = done->inode; struct bio_vec *bvec; struct extent_io_tree *io_tree, *failure_tree; - int i; struct bvec_iter_all iter_all; if (bio->bi_status) @@ -7841,7 +7884,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) done->uptodate = 1; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) + bio_for_each_segment_all(bvec, bio, iter_all) clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, io_tree, done->start, bvec->bv_page, btrfs_ino(BTRFS_I(inode)), 0); @@ -7919,7 +7962,7 @@ static void btrfs_retry_endio(struct bio *bio) struct bio_vec *bvec; int uptodate; int ret; - int i; + int i = 0; struct bvec_iter_all iter_all; if (bio->bi_status) @@ -7934,7 +7977,7 @@ static void btrfs_retry_endio(struct bio *bio) failure_tree = &BTRFS_I(inode)->io_failure_tree; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, bvec->bv_offset, done->start, bvec->bv_len); @@ -7946,6 +7989,7 @@ static void btrfs_retry_endio(struct bio *bio) bvec->bv_offset); else uptodate = 0; + i++; } done->uptodate = uptodate; @@ -9163,7 +9207,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; - ei->last_link_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); @@ -9182,10 +9225,11 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); - extent_io_tree_init(&ei->io_tree, inode); - extent_io_tree_init(&ei->io_failure_tree, inode); - ei->io_tree.track_uptodate = 1; - ei->io_failure_tree.track_uptodate = 1; + extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); + extent_io_tree_init(fs_info, &ei->io_failure_tree, + IO_TREE_INODE_IO_FAILURE, inode); + ei->io_tree.track_uptodate = true; + ei->io_failure_tree.track_uptodate = true; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); mutex_init(&ei->delalloc_mutex); @@ -9206,9 +9250,8 @@ void btrfs_test_destroy_inode(struct inode *inode) } #endif -static void btrfs_i_callback(struct rcu_head *head) +void btrfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } @@ -9234,7 +9277,7 @@ void btrfs_destroy_inode(struct inode *inode) * created. */ if (!root) - goto free; + return; while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@ -9252,8 +9295,6 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); -free: - call_rcu(&inode->i_rcu, btrfs_i_callback); } int btrfs_drop_inode(struct inode *inode) @@ -9430,7 +9471,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* Reference for the source. */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); } else { btrfs_pin_log_trans(root); root_log_pinned = true; @@ -9447,7 +9488,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* And now for the dest. */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); } else { btrfs_pin_log_trans(dest); dest_log_pinned = true; @@ -9583,7 +9624,7 @@ out_fail: btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || (new_inode && btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); if (root_log_pinned) { btrfs_end_log_trans(root); @@ -9769,7 +9810,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); } else { btrfs_pin_log_trans(root); log_pinned = true; @@ -9890,7 +9931,7 @@ out_fail: btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || (new_inode && btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); log_pinned = false; @@ -10193,7 +10234,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_aops; inode_set_bytes(inode, name_len); btrfs_i_size_write(BTRFS_I(inode), name_len); err = btrfs_update_inode(trans, root, inode); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ec2d8919e7fb..6dafa857bbb9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -187,11 +187,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) struct btrfs_inode *binode = BTRFS_I(inode); struct btrfs_root *root = binode->root; struct btrfs_trans_handle *trans; - unsigned int fsflags, old_fsflags; + unsigned int fsflags; int ret; - u64 old_flags; - unsigned int old_i_flags; - umode_t mode; + const char *comp = NULL; + u32 binode_flags = binode->flags; if (!inode_owner_or_capable(inode)) return -EPERM; @@ -212,13 +211,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) inode_lock(inode); - old_flags = binode->flags; - old_i_flags = inode->i_flags; - mode = inode->i_mode; - fsflags = btrfs_mask_fsflags_for_type(inode, fsflags); - old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); - if ((fsflags ^ old_fsflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if ((fsflags ^ btrfs_inode_flags_to_fsflags(binode->flags)) & + (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { ret = -EPERM; goto out_unlock; @@ -226,52 +221,52 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } if (fsflags & FS_SYNC_FL) - binode->flags |= BTRFS_INODE_SYNC; + binode_flags |= BTRFS_INODE_SYNC; else - binode->flags &= ~BTRFS_INODE_SYNC; + binode_flags &= ~BTRFS_INODE_SYNC; if (fsflags & FS_IMMUTABLE_FL) - binode->flags |= BTRFS_INODE_IMMUTABLE; + binode_flags |= BTRFS_INODE_IMMUTABLE; else - binode->flags &= ~BTRFS_INODE_IMMUTABLE; + binode_flags &= ~BTRFS_INODE_IMMUTABLE; if (fsflags & FS_APPEND_FL) - binode->flags |= BTRFS_INODE_APPEND; + binode_flags |= BTRFS_INODE_APPEND; else - binode->flags &= ~BTRFS_INODE_APPEND; + binode_flags &= ~BTRFS_INODE_APPEND; if (fsflags & FS_NODUMP_FL) - binode->flags |= BTRFS_INODE_NODUMP; + binode_flags |= BTRFS_INODE_NODUMP; else - binode->flags &= ~BTRFS_INODE_NODUMP; + binode_flags &= ~BTRFS_INODE_NODUMP; if (fsflags & FS_NOATIME_FL) - binode->flags |= BTRFS_INODE_NOATIME; + binode_flags |= BTRFS_INODE_NOATIME; else - binode->flags &= ~BTRFS_INODE_NOATIME; + binode_flags &= ~BTRFS_INODE_NOATIME; if (fsflags & FS_DIRSYNC_FL) - binode->flags |= BTRFS_INODE_DIRSYNC; + binode_flags |= BTRFS_INODE_DIRSYNC; else - binode->flags &= ~BTRFS_INODE_DIRSYNC; + binode_flags &= ~BTRFS_INODE_DIRSYNC; if (fsflags & FS_NOCOW_FL) { - if (S_ISREG(mode)) { + if (S_ISREG(inode->i_mode)) { /* * It's safe to turn csums off here, no extents exist. * Otherwise we want the flag to reflect the real COW * status of the file and will not set it. */ if (inode->i_size == 0) - binode->flags |= BTRFS_INODE_NODATACOW - | BTRFS_INODE_NODATASUM; + binode_flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; } else { - binode->flags |= BTRFS_INODE_NODATACOW; + binode_flags |= BTRFS_INODE_NODATACOW; } } else { /* * Revert back under same assumptions as above */ - if (S_ISREG(mode)) { + if (S_ISREG(inode->i_mode)) { if (inode->i_size == 0) - binode->flags &= ~(BTRFS_INODE_NODATACOW - | BTRFS_INODE_NODATASUM); + binode_flags &= ~(BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM); } else { - binode->flags &= ~BTRFS_INODE_NODATACOW; + binode_flags &= ~BTRFS_INODE_NODATACOW; } } @@ -281,57 +276,61 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) * things smaller. */ if (fsflags & FS_NOCOMP_FL) { - binode->flags &= ~BTRFS_INODE_COMPRESS; - binode->flags |= BTRFS_INODE_NOCOMPRESS; - - ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); - if (ret && ret != -ENODATA) - goto out_drop; + binode_flags &= ~BTRFS_INODE_COMPRESS; + binode_flags |= BTRFS_INODE_NOCOMPRESS; } else if (fsflags & FS_COMPR_FL) { - const char *comp; if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; goto out_unlock; } - binode->flags |= BTRFS_INODE_COMPRESS; - binode->flags &= ~BTRFS_INODE_NOCOMPRESS; + binode_flags |= BTRFS_INODE_COMPRESS; + binode_flags &= ~BTRFS_INODE_NOCOMPRESS; comp = btrfs_compress_type2str(fs_info->compress_type); if (!comp || comp[0] == 0) comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); - - ret = btrfs_set_prop(inode, "btrfs.compression", - comp, strlen(comp), 0); - if (ret) - goto out_drop; - } else { - ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); - if (ret && ret != -ENODATA) - goto out_drop; - binode->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } - trans = btrfs_start_transaction(root, 1); + /* + * 1 for inode item + * 2 for properties + */ + trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out_drop; + goto out_unlock; } + if (comp) { + ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, + strlen(comp), 0); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + set_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); + } else { + ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, + 0, 0); + if (ret && ret != -ENODATA) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } + } + + binode->flags = binode_flags; btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, inode); + out_end_trans: btrfs_end_transaction(trans); - out_drop: - if (ret) { - binode->flags = old_flags; - inode->i_flags = old_i_flags; - } - out_unlock: inode_unlock(inode); mnt_drop_write_file(file); @@ -501,6 +500,16 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + /* + * If the fs is mounted with nologreplay, which requires it to be + * mounted in RO mode as well, we can not allow discard on free space + * inside block groups, because log trees refer to extents that are not + * pinned in a block group's free space cache (pinning the extents is + * precisely the first phase of replaying a log tree). + */ + if (btrfs_test_opt(fs_info, NOLOGREPLAY)) + return -EROFS; + rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { @@ -3250,6 +3259,19 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, { int ret; u64 i, tail_len, chunk_count; + struct btrfs_root *root_dst = BTRFS_I(dst)->root; + + spin_lock(&root_dst->root_item_lock); + if (root_dst->send_in_progress) { + btrfs_warn_rl(root_dst->fs_info, +"cannot deduplicate to root %llu while send operations are using it (%d in progress)", + root_dst->root_key.objectid, + root_dst->send_in_progress); + spin_unlock(&root_dst->root_item_lock); + return -EAGAIN; + } + root_dst->dedupe_in_progress++; + spin_unlock(&root_dst->root_item_lock); tail_len = olen % BTRFS_MAX_DEDUPE_LEN; chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); @@ -3258,7 +3280,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, dst, dst_loff); if (ret) - return ret; + goto out; loff += BTRFS_MAX_DEDUPE_LEN; dst_loff += BTRFS_MAX_DEDUPE_LEN; @@ -3267,6 +3289,10 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, if (tail_len > 0) ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); +out: + spin_lock(&root_dst->root_item_lock); + root_dst->dedupe_in_progress--; + spin_unlock(&root_dst->root_item_lock); return ret; } @@ -3725,13 +3751,16 @@ process_slot: datal); if (disko) { + struct btrfs_ref ref = { 0 }; inode_add_bytes(inode, datal); - ret = btrfs_inc_extent_ref(trans, - root, - disko, diskl, 0, - root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), - new_key.offset - datao); + btrfs_init_generic_ref(&ref, + BTRFS_ADD_DELAYED_REF, disko, + diskl, 0); + btrfs_init_data_ref(&ref, + root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), + new_key.offset - datao); + ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3938,16 +3967,10 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, return -EXDEV; } - if (same_inode) - inode_lock(inode_in); - else - lock_two_nondirectories(inode_in, inode_out); - /* don't make the dst file partly checksummed */ if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { - ret = -EINVAL; - goto out_unlock; + return -EINVAL; } /* @@ -3981,26 +4004,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) - goto out_unlock; + return ret; ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len); if (ret < 0) - goto out_unlock; + return ret; - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags); - if (ret < 0 || *len == 0) - goto out_unlock; - - return 0; - - out_unlock: - if (same_inode) - inode_unlock(inode_in); - else - unlock_two_nondirectories(inode_in, inode_out); - - return ret; } loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, @@ -4015,16 +4026,22 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; + if (same_inode) + inode_lock(src_inode); + else + lock_two_nondirectories(src_inode, dst_inode); + ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, &len, remap_flags); if (ret < 0 || len == 0) - return ret; + goto out_unlock; if (remap_flags & REMAP_FILE_DEDUP) ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); +out_unlock: if (same_inode) inode_unlock(src_inode); else diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 82b84e4daad1..2f6c3c7851ed 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -12,10 +12,82 @@ #include "extent_io.h" #include "locking.h" -static void btrfs_assert_tree_read_locked(struct extent_buffer *eb); +#ifdef CONFIG_BTRFS_DEBUG +static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) +{ + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_inc(&eb->spinning_writers); +} + +static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) +{ + WARN_ON(atomic_read(&eb->spinning_writers) != 1); + atomic_dec(&eb->spinning_writers); +} + +static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) +{ + WARN_ON(atomic_read(&eb->spinning_writers)); +} + +static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->spinning_readers); +} + +static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) +{ + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); +} + +static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->read_locks); +} + +static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) +{ + atomic_dec(&eb->read_locks); +} + +static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ + BUG_ON(!atomic_read(&eb->read_locks)); +} + +static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->write_locks); +} + +static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) +{ + atomic_dec(&eb->write_locks); +} + +void btrfs_assert_tree_locked(struct extent_buffer *eb) +{ + BUG_ON(!atomic_read(&eb->write_locks)); +} + +#else +static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { } +static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { } +static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { } +static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { } +static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { } +static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { } +static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { } +static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { } +void btrfs_assert_tree_locked(struct extent_buffer *eb) { } +static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { } +static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { } +#endif void btrfs_set_lock_blocking_read(struct extent_buffer *eb) { + trace_btrfs_set_lock_blocking_read(eb); /* * No lock is required. The lock owner may change if we have a read * lock, but it won't change to or away from us. If we have the write @@ -25,13 +97,13 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb) return; btrfs_assert_tree_read_locked(eb); atomic_inc(&eb->blocking_readers); - WARN_ON(atomic_read(&eb->spinning_readers) == 0); - atomic_dec(&eb->spinning_readers); + btrfs_assert_spinning_readers_put(eb); read_unlock(&eb->lock); } void btrfs_set_lock_blocking_write(struct extent_buffer *eb) { + trace_btrfs_set_lock_blocking_write(eb); /* * No lock is required. The lock owner may change if we have a read * lock, but it won't change to or away from us. If we have the write @@ -40,8 +112,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) if (eb->lock_nested && current->pid == eb->lock_owner) return; if (atomic_read(&eb->blocking_writers) == 0) { - WARN_ON(atomic_read(&eb->spinning_writers) != 1); - atomic_dec(&eb->spinning_writers); + btrfs_assert_spinning_writers_put(eb); btrfs_assert_tree_locked(eb); atomic_inc(&eb->blocking_writers); write_unlock(&eb->lock); @@ -50,6 +121,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) void btrfs_clear_lock_blocking_read(struct extent_buffer *eb) { + trace_btrfs_clear_lock_blocking_read(eb); /* * No lock is required. The lock owner may change if we have a read * lock, but it won't change to or away from us. If we have the write @@ -59,7 +131,7 @@ void btrfs_clear_lock_blocking_read(struct extent_buffer *eb) return; BUG_ON(atomic_read(&eb->blocking_readers) == 0); read_lock(&eb->lock); - atomic_inc(&eb->spinning_readers); + btrfs_assert_spinning_readers_get(eb); /* atomic_dec_and_test implies a barrier */ if (atomic_dec_and_test(&eb->blocking_readers)) cond_wake_up_nomb(&eb->read_lock_wq); @@ -67,6 +139,7 @@ void btrfs_clear_lock_blocking_read(struct extent_buffer *eb) void btrfs_clear_lock_blocking_write(struct extent_buffer *eb) { + trace_btrfs_clear_lock_blocking_write(eb); /* * no lock is required. The lock owner may change if * we have a read lock, but it won't change to or away @@ -77,8 +150,7 @@ void btrfs_clear_lock_blocking_write(struct extent_buffer *eb) return; BUG_ON(atomic_read(&eb->blocking_writers) != 1); write_lock(&eb->lock); - WARN_ON(atomic_read(&eb->spinning_writers)); - atomic_inc(&eb->spinning_writers); + btrfs_assert_spinning_writers_get(eb); /* atomic_dec_and_test implies a barrier */ if (atomic_dec_and_test(&eb->blocking_writers)) cond_wake_up_nomb(&eb->write_lock_wq); @@ -90,6 +162,10 @@ void btrfs_clear_lock_blocking_write(struct extent_buffer *eb) */ void btrfs_tree_read_lock(struct extent_buffer *eb) { + u64 start_ns = 0; + + if (trace_btrfs_tree_read_lock_enabled()) + start_ns = ktime_get_ns(); again: BUG_ON(!atomic_read(&eb->blocking_writers) && current->pid == eb->lock_owner); @@ -104,8 +180,9 @@ again: * called on a partly (write-)locked tree. */ BUG_ON(eb->lock_nested); - eb->lock_nested = 1; + eb->lock_nested = true; read_unlock(&eb->lock); + trace_btrfs_tree_read_lock(eb, start_ns); return; } if (atomic_read(&eb->blocking_writers)) { @@ -114,8 +191,9 @@ again: atomic_read(&eb->blocking_writers) == 0); goto again; } - atomic_inc(&eb->read_locks); - atomic_inc(&eb->spinning_readers); + btrfs_assert_tree_read_locks_get(eb); + btrfs_assert_spinning_readers_get(eb); + trace_btrfs_tree_read_lock(eb, start_ns); } /* @@ -133,8 +211,9 @@ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) read_unlock(&eb->lock); return 0; } - atomic_inc(&eb->read_locks); - atomic_inc(&eb->spinning_readers); + btrfs_assert_tree_read_locks_get(eb); + btrfs_assert_spinning_readers_get(eb); + trace_btrfs_tree_read_lock_atomic(eb); return 1; } @@ -154,8 +233,9 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) read_unlock(&eb->lock); return 0; } - atomic_inc(&eb->read_locks); - atomic_inc(&eb->spinning_readers); + btrfs_assert_tree_read_locks_get(eb); + btrfs_assert_spinning_readers_get(eb); + trace_btrfs_try_tree_read_lock(eb); return 1; } @@ -175,9 +255,10 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) write_unlock(&eb->lock); return 0; } - atomic_inc(&eb->write_locks); - atomic_inc(&eb->spinning_writers); + btrfs_assert_tree_write_locks_get(eb); + btrfs_assert_spinning_writers_get(eb); eb->lock_owner = current->pid; + trace_btrfs_try_tree_write_lock(eb); return 1; } @@ -186,6 +267,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { + trace_btrfs_tree_read_unlock(eb); /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. @@ -193,13 +275,12 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) * field only matters to the lock owner. */ if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; + eb->lock_nested = false; return; } btrfs_assert_tree_read_locked(eb); - WARN_ON(atomic_read(&eb->spinning_readers) == 0); - atomic_dec(&eb->spinning_readers); - atomic_dec(&eb->read_locks); + btrfs_assert_spinning_readers_put(eb); + btrfs_assert_tree_read_locks_put(eb); read_unlock(&eb->lock); } @@ -208,6 +289,7 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) { + trace_btrfs_tree_read_unlock_blocking(eb); /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. @@ -215,7 +297,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) * field only matters to the lock owner. */ if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; + eb->lock_nested = false; return; } btrfs_assert_tree_read_locked(eb); @@ -223,7 +305,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) /* atomic_dec_and_test implies a barrier */ if (atomic_dec_and_test(&eb->blocking_readers)) cond_wake_up_nomb(&eb->read_lock_wq); - atomic_dec(&eb->read_locks); + btrfs_assert_tree_read_locks_put(eb); } /* @@ -232,6 +314,11 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) */ void btrfs_tree_lock(struct extent_buffer *eb) { + u64 start_ns = 0; + + if (trace_btrfs_tree_lock_enabled()) + start_ns = ktime_get_ns(); + WARN_ON(eb->lock_owner == current->pid); again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); @@ -242,10 +329,10 @@ again: write_unlock(&eb->lock); goto again; } - WARN_ON(atomic_read(&eb->spinning_writers)); - atomic_inc(&eb->spinning_writers); - atomic_inc(&eb->write_locks); + btrfs_assert_spinning_writers_get(eb); + btrfs_assert_tree_write_locks_get(eb); eb->lock_owner = current->pid; + trace_btrfs_tree_lock(eb, start_ns); } /* @@ -258,28 +345,18 @@ void btrfs_tree_unlock(struct extent_buffer *eb) BUG_ON(blockers > 1); btrfs_assert_tree_locked(eb); + trace_btrfs_tree_unlock(eb); eb->lock_owner = 0; - atomic_dec(&eb->write_locks); + btrfs_assert_tree_write_locks_put(eb); if (blockers) { - WARN_ON(atomic_read(&eb->spinning_writers)); + btrfs_assert_no_spinning_writers(eb); atomic_dec(&eb->blocking_writers); /* Use the lighter barrier after atomic */ smp_mb__after_atomic(); cond_wake_up_nomb(&eb->write_lock_wq); } else { - WARN_ON(atomic_read(&eb->spinning_writers) != 1); - atomic_dec(&eb->spinning_writers); + btrfs_assert_spinning_writers_put(eb); write_unlock(&eb->lock); } } - -void btrfs_assert_tree_locked(struct extent_buffer *eb) -{ - BUG_ON(!atomic_read(&eb->write_locks)); -} - -static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) -{ - BUG_ON(!atomic_read(&eb->read_locks)); -} diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6fde2b2741ef..52889da69113 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/writeback.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "transaction.h" #include "btrfs_inode.h" @@ -194,8 +195,11 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); - if (dio) + if (dio) { + percpu_counter_add_batch(&fs_info->dio_bytes, len, + fs_info->delalloc_batch); set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); + } /* one ref for the tree */ refcount_set(&entry->refs, 1); @@ -270,13 +274,12 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, * when an ordered extent is finished. If the list covers more than one * ordered extent, it is split across multiples. */ -void btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, +void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum) { struct btrfs_ordered_inode_tree *tree; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &BTRFS_I(entry->inode)->ordered_tree; spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); spin_unlock_irq(&tree->lock); @@ -442,7 +445,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); list_del(&sum->list); - kfree(sum); + kvfree(sum); } kmem_cache_free(btrfs_ordered_extent_cache, entry); } @@ -468,6 +471,10 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (root != fs_info->tree_root) btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false); + if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) + percpu_counter_add_batch(&fs_info->dio_bytes, -entry->len, + fs_info->delalloc_batch); + tree = &btrfs_inode->ordered_tree; spin_lock_irq(&tree->lock); node = &entry->rb_node; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index fb9a161f0215..4c5991c3de14 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -167,8 +167,7 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type, int compress_type); -void btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, +void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, u64 file_offset); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index df49931ffe92..1141ca5fae6a 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -189,7 +189,7 @@ void btrfs_print_leaf(struct extent_buffer *l) btrfs_info(fs_info, "leaf %llu gen %llu total ptrs %d free space %d owner %llu", btrfs_header_bytenr(l), btrfs_header_generation(l), nr, - btrfs_leaf_free_space(fs_info, l), btrfs_header_owner(l)); + btrfs_leaf_free_space(l), btrfs_header_owner(l)); print_eb_refs_lock(l); for (i = 0 ; i < nr ; i++) { item = btrfs_item_nr(i); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index dc6140013ae8..ca2716917e37 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -23,36 +23,6 @@ struct prop_handler { int inheritable; }; -static int prop_compression_validate(const char *value, size_t len); -static int prop_compression_apply(struct inode *inode, - const char *value, - size_t len); -static const char *prop_compression_extract(struct inode *inode); - -static struct prop_handler prop_handlers[] = { - { - .xattr_name = XATTR_BTRFS_PREFIX "compression", - .validate = prop_compression_validate, - .apply = prop_compression_apply, - .extract = prop_compression_extract, - .inheritable = 1 - }, -}; - -void __init btrfs_props_init(void) -{ - int i; - - hash_init(prop_handlers_ht); - - for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { - struct prop_handler *p = &prop_handlers[i]; - u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); - - hash_add(prop_handlers_ht, &p->node, h); - } -} - static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash) { struct hlist_head *h; @@ -85,15 +55,9 @@ find_prop_handler(const char *name, return NULL; } -static int __btrfs_set_prop(struct btrfs_trans_handle *trans, - struct inode *inode, - const char *name, - const char *value, - size_t value_len, - int flags) +int btrfs_validate_prop(const char *name, const char *value, size_t value_len) { const struct prop_handler *handler; - int ret; if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN) return -EINVAL; @@ -102,9 +66,26 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans, if (!handler) return -EINVAL; + if (value_len == 0) + return 0; + + return handler->validate(value, value_len); +} + +int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, + const char *name, const char *value, size_t value_len, + int flags) +{ + const struct prop_handler *handler; + int ret; + + handler = find_prop_handler(name, NULL); + if (!handler) + return -EINVAL; + if (value_len == 0) { ret = btrfs_setxattr(trans, inode, handler->xattr_name, - NULL, 0, flags); + NULL, 0, flags); if (ret) return ret; @@ -114,17 +95,14 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans, return ret; } - ret = handler->validate(value, value_len); - if (ret) - return ret; - ret = btrfs_setxattr(trans, inode, handler->xattr_name, - value, value_len, flags); + ret = btrfs_setxattr(trans, inode, handler->xattr_name, value, + value_len, flags); if (ret) return ret; ret = handler->apply(inode, value, value_len); if (ret) { - btrfs_setxattr(trans, inode, handler->xattr_name, - NULL, 0, flags); + btrfs_setxattr(trans, inode, handler->xattr_name, NULL, + 0, flags); return ret; } @@ -133,15 +111,6 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans, return 0; } -int btrfs_set_prop(struct inode *inode, - const char *name, - const char *value, - size_t value_len, - int flags) -{ - return __btrfs_set_prop(NULL, inode, name, value, value_len, flags); -} - static int iterate_object_props(struct btrfs_root *root, struct btrfs_path *path, u64 objectid, @@ -283,6 +252,78 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) return ret; } +static int prop_compression_validate(const char *value, size_t len) +{ + if (!value) + return 0; + + if (!strncmp("lzo", value, 3)) + return 0; + else if (!strncmp("zlib", value, 4)) + return 0; + else if (!strncmp("zstd", value, 4)) + return 0; + + return -EINVAL; +} + +static int prop_compression_apply(struct inode *inode, const char *value, + size_t len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int type; + + if (len == 0) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + + return 0; + } + + if (!strncmp("lzo", value, 3)) { + type = BTRFS_COMPRESS_LZO; + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + } else if (!strncmp("zlib", value, 4)) { + type = BTRFS_COMPRESS_ZLIB; + } else if (!strncmp("zstd", value, 4)) { + type = BTRFS_COMPRESS_ZSTD; + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + } else { + return -EINVAL; + } + + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->prop_compress = type; + + return 0; +} + +static const char *prop_compression_extract(struct inode *inode) +{ + switch (BTRFS_I(inode)->prop_compress) { + case BTRFS_COMPRESS_ZLIB: + case BTRFS_COMPRESS_LZO: + case BTRFS_COMPRESS_ZSTD: + return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); + default: + break; + } + + return NULL; +} + +static struct prop_handler prop_handlers[] = { + { + .xattr_name = XATTR_BTRFS_PREFIX "compression", + .validate = prop_compression_validate, + .apply = prop_compression_apply, + .extract = prop_compression_extract, + .inheritable = 1 + }, +}; + static int inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *parent) @@ -308,20 +349,38 @@ static int inherit_props(struct btrfs_trans_handle *trans, if (!value) continue; + /* + * This is not strictly necessary as the property should be + * valid, but in case it isn't, don't propagate it futher. + */ + ret = h->validate(value, strlen(value)); + if (ret) + continue; + num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); ret = btrfs_block_rsv_add(root, trans->block_rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH); if (ret) - goto out; - ret = __btrfs_set_prop(trans, inode, h->xattr_name, - value, strlen(value), 0); + return ret; + + ret = btrfs_setxattr(trans, inode, h->xattr_name, value, + strlen(value), 0); + if (!ret) { + ret = h->apply(inode, value, strlen(value)); + if (ret) + btrfs_setxattr(trans, inode, h->xattr_name, + NULL, 0, 0); + else + set_bit(BTRFS_INODE_HAS_PROPS, + &BTRFS_I(inode)->runtime_flags); + } + btrfs_block_rsv_release(fs_info, trans->block_rsv, num_bytes); if (ret) - goto out; + return ret; } - ret = 0; -out: - return ret; + + return 0; } int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, @@ -364,64 +423,17 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, return ret; } -static int prop_compression_validate(const char *value, size_t len) -{ - if (!strncmp("lzo", value, len)) - return 0; - else if (!strncmp("zlib", value, len)) - return 0; - else if (!strncmp("zstd", value, len)) - return 0; - - return -EINVAL; -} - -static int prop_compression_apply(struct inode *inode, - const char *value, - size_t len) +void __init btrfs_props_init(void) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int type; - - if (len == 0) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; - - return 0; - } - - if (!strncmp("lzo", value, 3)) { - type = BTRFS_COMPRESS_LZO; - btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); - } else if (!strncmp("zlib", value, 4)) { - type = BTRFS_COMPRESS_ZLIB; - } else if (!strncmp("zstd", value, len)) { - type = BTRFS_COMPRESS_ZSTD; - btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); - } else { - return -EINVAL; - } + int i; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = type; + hash_init(prop_handlers_ht); - return 0; -} + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + struct prop_handler *p = &prop_handlers[i]; + u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); -static const char *prop_compression_extract(struct inode *inode) -{ - switch (BTRFS_I(inode)->prop_compress) { - case BTRFS_COMPRESS_ZLIB: - case BTRFS_COMPRESS_LZO: - case BTRFS_COMPRESS_ZSTD: - return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); - default: - break; + hash_add(prop_handlers_ht, &p->node, h); } - - return NULL; } - diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 618815b4f9d5..40b2c65b518c 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -10,11 +10,10 @@ void __init btrfs_props_init(void); -int btrfs_set_prop(struct inode *inode, - const char *name, - const char *value, - size_t value_len, +int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, + const char *name, const char *value, size_t value_len, int flags); +int btrfs_validate_prop(const char *name, const char *value, size_t value_len); int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e659d9d61107..2f708f2c4e67 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -918,8 +918,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) /* * initially create the quota tree */ - quota_root = btrfs_create_tree(trans, fs_info, - BTRFS_QUOTA_TREE_OBJECTID); + quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID); if (IS_ERR(quota_root)) { ret = PTR_ERR(quota_root); btrfs_abort_transaction(trans, ret); @@ -1101,7 +1100,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) list_del("a_root->dirty_list); btrfs_tree_lock(quota_root->node); - clean_tree_block(fs_info, quota_root->node); + btrfs_clean_tree_block(quota_root->node); btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 67a6f7d47402..f3d0576dd327 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1442,12 +1442,11 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio, static void set_bio_pages_uptodate(struct bio *bio) { struct bio_vec *bvec; - int i; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i, iter_all) + bio_for_each_segment_all(bvec, bio, iter_all) SetPageUptodate(bvec->bv_page); } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index d09b6cdb785a..e87cbdad02a3 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -205,28 +205,17 @@ static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) #ifdef CONFIG_STACKTRACE static void __save_stack_trace(struct ref_action *ra) { - struct stack_trace stack_trace; - - stack_trace.max_entries = MAX_TRACE; - stack_trace.nr_entries = 0; - stack_trace.entries = ra->trace; - stack_trace.skip = 2; - save_stack_trace(&stack_trace); - ra->trace_len = stack_trace.nr_entries; + ra->trace_len = stack_trace_save(ra->trace, MAX_TRACE, 2); } static void __print_stack_trace(struct btrfs_fs_info *fs_info, struct ref_action *ra) { - struct stack_trace trace; - if (ra->trace_len == 0) { btrfs_err(fs_info, " ref-verify: no stacktrace"); return; } - trace.nr_entries = ra->trace_len; - trace.entries = ra->trace; - print_stack_trace(&trace, 2); + stack_trace_print(ra->trace, ra->trace_len, 2); } #else static void inline __save_stack_trace(struct ref_action *ra) @@ -520,6 +509,7 @@ static int process_leaf(struct btrfs_root *root, switch (key.type) { case BTRFS_EXTENT_ITEM_KEY: *num_bytes = key.offset; + /* fall through */ case BTRFS_METADATA_ITEM_KEY: *bytenr = key.objectid; ret = process_extent_item(fs_info, path, &key, i, @@ -670,36 +660,43 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info, /* * btrfs_ref_tree_mod: called when we modify a ref for a bytenr - * @root: the root we are making this modification from. - * @bytenr: the bytenr we are modifying. - * @num_bytes: number of bytes. - * @parent: the parent bytenr. - * @ref_root: the original root owner of the bytenr. - * @owner: level in the case of metadata, inode in the case of data. - * @offset: 0 for metadata, file offset for data. - * @action: the action that we are doing, this is the same as the delayed ref - * action. * * This will add an action item to the given bytenr and do sanity checks to make * sure we haven't messed something up. If we are making a new allocation and * this block entry has history we will delete all previous actions as long as * our sanity checks pass as they are no longer needed. */ -int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, - u64 parent, u64 ref_root, u64 owner, u64 offset, - int action) +int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, + struct btrfs_ref *generic_ref) { - struct btrfs_fs_info *fs_info = root->fs_info; struct ref_entry *ref = NULL, *exist; struct ref_action *ra = NULL; struct block_entry *be = NULL; struct root_entry *re = NULL; + int action = generic_ref->action; int ret = 0; - bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + bool metadata; + u64 bytenr = generic_ref->bytenr; + u64 num_bytes = generic_ref->len; + u64 parent = generic_ref->parent; + u64 ref_root; + u64 owner; + u64 offset; - if (!btrfs_test_opt(root->fs_info, REF_VERIFY)) + if (!btrfs_test_opt(fs_info, REF_VERIFY)) return 0; + if (generic_ref->type == BTRFS_REF_METADATA) { + ref_root = generic_ref->tree_ref.root; + owner = generic_ref->tree_ref.level; + offset = 0; + } else { + ref_root = generic_ref->data_ref.ref_root; + owner = generic_ref->data_ref.ino; + offset = generic_ref->data_ref.offset; + } + metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); ra = kmalloc(sizeof(struct ref_action), GFP_NOFS); if (!ra || !ref) { @@ -732,7 +729,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, INIT_LIST_HEAD(&ra->list); ra->action = action; - ra->root = root->root_key.objectid; + ra->root = generic_ref->real_root; /* * This is an allocation, preallocate the block_entry in case we haven't @@ -745,7 +742,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * is and the new root objectid, so let's not treat the passed * in root as if it really has a ref for this bytenr. */ - be = add_block_entry(root->fs_info, bytenr, num_bytes, ref_root); + be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); if (IS_ERR(be)) { kfree(ra); ret = PTR_ERR(be); @@ -787,13 +784,13 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * one we want to lookup below when we modify the * re->num_refs. */ - ref_root = root->root_key.objectid; - re->root_objectid = root->root_key.objectid; + ref_root = generic_ref->real_root; + re->root_objectid = generic_ref->real_root; re->num_refs = 0; } - spin_lock(&root->fs_info->ref_verify_lock); - be = lookup_block_entry(&root->fs_info->block_tree, bytenr); + spin_lock(&fs_info->ref_verify_lock); + be = lookup_block_entry(&fs_info->block_tree, bytenr); if (!be) { btrfs_err(fs_info, "trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!", @@ -862,7 +859,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * didn't think of some other corner case. */ btrfs_err(fs_info, "failed to find root %llu for %llu", - root->root_key.objectid, be->bytenr); + generic_ref->real_root, be->bytenr); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); kfree(ra); @@ -881,7 +878,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, list_add_tail(&ra->list, &be->actions); ret = 0; out_unlock: - spin_unlock(&root->fs_info->ref_verify_lock); + spin_unlock(&fs_info->ref_verify_lock); out: if (ret) btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index b7d2a4edfdb7..855de37719b5 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -9,9 +9,8 @@ #ifdef CONFIG_BTRFS_FS_REF_VERIFY int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); -int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, - u64 parent, u64 ref_root, u64 owner, u64 offset, - int action); +int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, + struct btrfs_ref *generic_ref); void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, u64 len); @@ -30,9 +29,8 @@ static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) { } -static inline int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, int action) +static inline int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, + struct btrfs_ref *generic_ref) { return 0; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ddf028509931..a459ecddcce4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1643,6 +1643,8 @@ int replace_file_extents(struct btrfs_trans_handle *trans, nritems = btrfs_header_nritems(leaf); for (i = 0; i < nritems; i++) { + struct btrfs_ref ref = { 0 }; + cond_resched(); btrfs_item_key_to_cpu(leaf, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) @@ -1703,18 +1705,23 @@ int replace_file_extents(struct btrfs_trans_handle *trans, dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); - ret = btrfs_inc_extent_ref(trans, root, new_bytenr, - num_bytes, parent, - btrfs_header_owner(leaf), - key.objectid, key.offset); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, + num_bytes, parent); + ref.real_root = root->root_key.objectid; + btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), + key.objectid, key.offset); + ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - parent, btrfs_header_owner(leaf), - key.objectid, key.offset); + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, + num_bytes, parent); + ref.real_root = root->root_key.objectid; + btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), + key.objectid, key.offset); + ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; @@ -1756,6 +1763,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_fs_info *fs_info = dest->fs_info; struct extent_buffer *eb; struct extent_buffer *parent; + struct btrfs_ref ref = { 0 }; struct btrfs_key key; u64 old_bytenr; u64 new_bytenr; @@ -1916,23 +1924,31 @@ again: path->slots[level], old_ptr_gen); btrfs_mark_buffer_dirty(path->nodes[level]); - ret = btrfs_inc_extent_ref(trans, src, old_bytenr, - blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, + blocksize, path->nodes[level]->start); + ref.skip_qgroup = true; + btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); + ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); - ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, - blocksize, 0, dest->root_key.objectid, - level - 1, 0); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, + blocksize, 0); + ref.skip_qgroup = true; + btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); + ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); - ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, - path->nodes[level]->start, - src->root_key.objectid, level - 1, 0); + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, + blocksize, path->nodes[level]->start); + btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); + ref.skip_qgroup = true; + ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); - ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, - 0, dest->root_key.objectid, level - 1, - 0); + btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, + blocksize, 0); + btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); + ref.skip_qgroup = true; + ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -2721,6 +2737,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { struct btrfs_key first_key; + struct btrfs_ref ref = { 0 }; cond_resched(); @@ -2826,11 +2843,13 @@ static int do_relocation(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(upper->eb); - ret = btrfs_inc_extent_ref(trans, root, - node->eb->start, blocksize, - upper->eb->start, - btrfs_header_owner(upper->eb), - node->level, 0); + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, + node->eb->start, blocksize, + upper->eb->start); + ref.real_root = root->root_key.objectid; + btrfs_init_tree_ref(&ref, node->level, + btrfs_header_owner(upper->eb)); + ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); @@ -4222,7 +4241,7 @@ out: return inode; } -static struct reloc_control *alloc_reloc_control(void) +static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) { struct reloc_control *rc; @@ -4234,7 +4253,8 @@ static struct reloc_control *alloc_reloc_control(void) INIT_LIST_HEAD(&rc->dirty_subvol_roots); backref_cache_init(&rc->backref_cache); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL); + extent_io_tree_init(fs_info, &rc->processed_blocks, + IO_TREE_RELOC_BLOCKS, NULL); return rc; } @@ -4276,7 +4296,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) return -ETXTBSY; } - rc = alloc_reloc_control(); + rc = alloc_reloc_control(fs_info); if (!rc) { btrfs_put_block_group(bg); return -ENOMEM; @@ -4298,7 +4318,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) goto out; } - inode = lookup_free_space_inode(fs_info, rc->block_group, path); + inode = lookup_free_space_inode(rc->block_group, path); btrfs_free_path(path); if (!IS_ERR(inode)) @@ -4330,27 +4350,36 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); mutex_unlock(&fs_info->cleaner_mutex); - if (ret < 0) { + if (ret < 0) err = ret; - goto out; - } - - if (rc->extents_found == 0) - break; - - btrfs_info(fs_info, "found %llu extents", rc->extents_found); + /* + * We may have gotten ENOSPC after we already dirtied some + * extents. If writeout happens while we're relocating a + * different block group we could end up hitting the + * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in + * btrfs_reloc_cow_block. Make sure we write everything out + * properly so we don't trip over this problem, and then break + * out of the loop if we hit an error. + */ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { ret = btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); - if (ret) { + if (ret) err = ret; - goto out; - } invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; } + + if (err < 0) + goto out; + + if (rc->extents_found == 0) + break; + + btrfs_info(fs_info, "found %llu extents", rc->extents_found); + } WARN_ON(rc->block_group->pinned > 0); @@ -4472,7 +4501,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (list_empty(&reloc_roots)) goto out; - rc = alloc_reloc_control(); + rc = alloc_reloc_control(fs_info); if (!rc) { err = -ENOMEM; goto out; @@ -4594,7 +4623,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) new_bytenr = ordered->start + (sums->bytenr - disk_bytenr); sums->bytenr = new_bytenr; - btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_add_ordered_sum(ordered, sums); } out: btrfs_put_ordered_extent(ordered); @@ -4667,14 +4696,12 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve) { - struct btrfs_root *root; - struct reloc_control *rc; + struct btrfs_root *root = pending->root; + struct reloc_control *rc = root->fs_info->reloc_ctl; - root = pending->root; - if (!root->reloc_root) + if (!root->reloc_root || !rc) return; - rc = root->fs_info->reloc_ctl; if (!rc->merge_reloc_tree) return; @@ -4703,10 +4730,10 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *reloc_root; struct btrfs_root *new_root; - struct reloc_control *rc; + struct reloc_control *rc = root->fs_info->reloc_ctl; int ret; - if (!root->reloc_root) + if (!root->reloc_root || !rc) return 0; rc = root->fs_info->reloc_ctl; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 893d12fbfda0..1b9a5d0de139 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -137,11 +137,14 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - if (ret != 0) { - btrfs_print_leaf(path->nodes[0]); - btrfs_crit(fs_info, "unable to update root key %llu %u %llu", - key->objectid, key->type, key->offset); - BUG_ON(1); + if (ret > 0) { + btrfs_crit(fs_info, + "unable to find root key (%llu %u %llu) in tree %llu", + key->objectid, key->type, key->offset, + root->root_key.objectid); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + goto out; } l = path->nodes[0]; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a99588536c79..f7b29f9db5e2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3791,7 +3791,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, struct btrfs_workqueue *scrub_parity = NULL; if (btrfs_fs_closing(fs_info)) - return -EINVAL; + return -EAGAIN; if (fs_info->nodesize > BTRFS_STRIPE_LEN) { /* @@ -3999,9 +3999,9 @@ int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev) +int btrfs_scrub_cancel_dev(struct btrfs_device *dev) { + struct btrfs_fs_info *fs_info = dev->fs_info; struct scrub_ctx *sctx; mutex_lock(&fs_info->scrub_lock); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7ea2d6b1f170..dd38dfe174df 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1160,7 +1160,6 @@ out: struct backref_ctx { struct send_ctx *sctx; - struct btrfs_path *path; /* number of total found references */ u64 found; @@ -1213,8 +1212,6 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) { struct backref_ctx *bctx = ctx_; struct clone_root *found; - int ret; - u64 i_size; /* First check if the root is in the list of accepted clone sources */ found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots, @@ -1231,19 +1228,6 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) } /* - * There are inodes that have extents that lie behind its i_size. Don't - * accept clones from these extents. - */ - ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL, - NULL, NULL, NULL); - btrfs_release_path(bctx->path); - if (ret < 0) - return ret; - - if (offset + bctx->data_offset + bctx->extent_len > i_size) - return 0; - - /* * Make sure we don't consider clones from send_root that are * behind the current inode/offset. */ @@ -1319,8 +1303,6 @@ static int find_extent_clone(struct send_ctx *sctx, goto out; } - backref_ctx->path = tmp_path; - if (data_offset >= ino_size) { /* * There may be extents that lie behind the file's size. @@ -5082,6 +5064,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *path; struct btrfs_key key; int ret; + u64 clone_src_i_size; /* * Prevent cloning from a zero offset with a length matching the sector @@ -5107,6 +5090,16 @@ static int clone_range(struct send_ctx *sctx, return -ENOMEM; /* + * There are inodes that have extents that lie behind its i_size. Don't + * accept clones from these extents. + */ + ret = __get_inode_info(clone_root->root, path, clone_root->ino, + &clone_src_i_size, NULL, NULL, NULL, NULL, NULL); + btrfs_release_path(path); + if (ret < 0) + goto out; + + /* * We can't send a clone operation for the entire range if we find * extent items in the respective range in the source file that * refer to different extents or if we find holes. @@ -5148,6 +5141,7 @@ static int clone_range(struct send_ctx *sctx, u8 type; u64 ext_len; u64 clone_len; + u64 clone_data_offset; if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); @@ -5201,10 +5195,30 @@ static int clone_range(struct send_ctx *sctx, if (key.offset >= clone_root->offset + len) break; + if (key.offset >= clone_src_i_size) + break; + + if (key.offset + ext_len > clone_src_i_size) + ext_len = clone_src_i_size - key.offset; + + clone_data_offset = btrfs_file_extent_offset(leaf, ei); + if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) { + clone_root->offset = key.offset; + if (clone_data_offset < data_offset && + clone_data_offset + ext_len > data_offset) { + u64 extent_offset; + + extent_offset = data_offset - clone_data_offset; + ext_len -= extent_offset; + clone_data_offset += extent_offset; + clone_root->offset += extent_offset; + } + } + clone_len = min_t(u64, ext_len, len); if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte && - btrfs_file_extent_offset(leaf, ei) == data_offset) + clone_data_offset == data_offset) ret = send_clone(sctx, offset, clone_len, clone_root); else ret = send_extent_data(sctx, offset, clone_len); @@ -6579,6 +6593,38 @@ commit_trans: return btrfs_commit_transaction(trans); } +/* + * Make sure any existing dellaloc is flushed for any root used by a send + * operation so that we do not miss any data and we do not race with writeback + * finishing and changing a tree while send is using the tree. This could + * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and + * a send operation then uses the subvolume. + * After flushing delalloc ensure_commit_roots_uptodate() must be called. + */ +static int flush_delalloc_roots(struct send_ctx *sctx) +{ + struct btrfs_root *root = sctx->parent_root; + int ret; + int i; + + if (root) { + ret = btrfs_start_delalloc_snapshot(root); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + } + + for (i = 0; i < sctx->clone_roots_cnt; i++) { + root = sctx->clone_roots[i].root; + ret = btrfs_start_delalloc_snapshot(root); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + } + + return 0; +} + static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) { spin_lock(&root->root_item_lock); @@ -6594,6 +6640,13 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) spin_unlock(&root->root_item_lock); } +static void dedupe_in_progress_warn(const struct btrfs_root *root) +{ + btrfs_warn_rl(root->fs_info, +"cannot use root %llu for send while deduplications on it are in progress (%d in progress)", + root->root_key.objectid, root->dedupe_in_progress); +} + long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) { int ret = 0; @@ -6617,6 +6670,11 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) * making it RW. This also protects against deletion. */ spin_lock(&send_root->root_item_lock); + if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { + dedupe_in_progress_warn(send_root); + spin_unlock(&send_root->root_item_lock); + return -EAGAIN; + } send_root->send_in_progress++; spin_unlock(&send_root->root_item_lock); @@ -6751,6 +6809,13 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) ret = -EPERM; goto out; } + if (clone_root->dedupe_in_progress) { + dedupe_in_progress_warn(clone_root); + spin_unlock(&clone_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EAGAIN; + goto out; + } clone_root->send_in_progress++; spin_unlock(&clone_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -6785,6 +6850,13 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) ret = -EPERM; goto out; } + if (sctx->parent_root->dedupe_in_progress) { + dedupe_in_progress_warn(sctx->parent_root); + spin_unlock(&sctx->parent_root->root_item_lock); + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -EAGAIN; + goto out; + } spin_unlock(&sctx->parent_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -6803,6 +6875,10 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) NULL); sort_clone_roots = 1; + ret = flush_delalloc_roots(sctx); + if (ret) + goto out; + ret = ensure_commit_roots_uptodate(sctx); if (ret) goto out; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 120e4340792a..0645ec428b4f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1400,7 +1400,7 @@ static inline int is_subvolume_inode(struct inode *inode) } static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, - const char *device_name, struct vfsmount *mnt) + struct vfsmount *mnt) { struct dentry *root; int ret; @@ -1649,7 +1649,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } /* mount_subvol() will free subvol_name and mnt_root */ - root = mount_subvol(subvol_name, subvol_objectid, device_name, mnt_root); + root = mount_subvol(subvol_name, subvol_objectid, mnt_root); out: return root; @@ -2298,6 +2298,7 @@ static const struct super_operations btrfs_super_ops = { .show_devname = btrfs_show_devname, .alloc_inode = btrfs_alloc_inode, .destroy_inode = btrfs_destroy_inode, + .free_inode = btrfs_free_inode, .statfs = btrfs_statfs, .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 8a59597f1883..9238fd4f1734 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -17,6 +17,16 @@ static struct vfsmount *test_mnt = NULL; +const char *test_error[] = { + [TEST_ALLOC_FS_INFO] = "cannot allocate fs_info", + [TEST_ALLOC_ROOT] = "cannot allocate root", + [TEST_ALLOC_EXTENT_BUFFER] = "cannot extent buffer", + [TEST_ALLOC_PATH] = "cannot allocate path", + [TEST_ALLOC_INODE] = "cannot allocate inode", + [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", + [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", +}; + static const struct super_operations btrfs_test_super_ops = { .alloc_inode = btrfs_alloc_inode, .destroy_inode = btrfs_test_destroy_inode, @@ -99,7 +109,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->qgroup_lock); - spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); @@ -115,8 +124,10 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - extent_io_tree_init(&fs_info->freed_extents[0], NULL); - extent_io_tree_init(&fs_info->freed_extents[1], NULL); + extent_io_tree_init(fs_info, &fs_info->freed_extents[0], + IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); + extent_io_tree_init(fs_info, &fs_info->freed_extents[1], + IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 70ff9f9d86a1..ee277bbd939b 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -10,7 +10,22 @@ int btrfs_run_sanity_tests(void); #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__) -#define test_err(fmt, ...) pr_err("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__) +#define test_err(fmt, ...) pr_err("BTRFS: selftest: %s:%d " fmt "\n", \ + __FILE__, __LINE__, ##__VA_ARGS__) + +#define test_std_err(index) test_err("%s", test_error[index]) + +enum { + TEST_ALLOC_FS_INFO, + TEST_ALLOC_ROOT, + TEST_ALLOC_EXTENT_BUFFER, + TEST_ALLOC_PATH, + TEST_ALLOC_INODE, + TEST_ALLOC_BLOCK_GROUP, + TEST_ALLOC_EXTENT_MAP, +}; + +extern const char *test_error[]; struct btrfs_root; struct btrfs_trans_handle; diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 7d72eab6d32c..a1b9f9b5978e 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -30,27 +30,27 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_err("could not allocate fs_info"); + test_std_err(TEST_ALLOC_FS_INFO); return -ENOMEM; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_err("could not allocate root"); + test_std_err(TEST_ALLOC_ROOT); ret = PTR_ERR(root); goto out; } path = btrfs_alloc_path(); if (!path) { - test_err("could not allocate path"); + test_std_err(TEST_ALLOC_PATH); ret = -ENOMEM; goto out; } path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize); if (!eb) { - test_err("could not allocate dummy buffer"); + test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; goto out; } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 3c46d7f23456..7bf4d5734dbe 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -73,11 +73,15 @@ static int test_find_delalloc(u32 sectorsize) inode = btrfs_new_test_inode(); if (!inode) { - test_err("failed to allocate test inode"); + test_std_err(TEST_ALLOC_INODE); return -ENOMEM; } - extent_io_tree_init(&tmp, NULL); + /* + * Passing NULL as we don't have fs_info but tracepoints are not used + * at this point + */ + extent_io_tree_init(NULL, &tmp, IO_TREE_SELFTEST, NULL); /* * First go through and create and mark all of our pages dirty, we pin @@ -374,8 +378,8 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; unsigned long len; - unsigned long *bitmap; - struct extent_buffer *eb; + unsigned long *bitmap = NULL; + struct extent_buffer *eb = NULL; int ret; test_msg("running extent buffer bitmap tests"); @@ -388,18 +392,23 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) ? sectorsize * 4 : sectorsize; fs_info = btrfs_alloc_dummy_fs_info(len, len); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } bitmap = kmalloc(len, GFP_KERNEL); if (!bitmap) { test_err("couldn't allocate test bitmap"); - return -ENOMEM; + ret = -ENOMEM; + goto out; } eb = __alloc_dummy_extent_buffer(fs_info, 0, len); if (!eb) { - test_err("couldn't allocate test extent buffer"); - kfree(bitmap); - return -ENOMEM; + test_std_err(TEST_ALLOC_ROOT); + ret = -ENOMEM; + goto out; } ret = __test_eb_bitmaps(bitmap, eb, len); @@ -408,17 +417,18 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) /* Do it over again with an extent buffer which isn't page-aligned. */ free_extent_buffer(eb); - eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len); + eb = __alloc_dummy_extent_buffer(fs_info, nodesize / 2, len); if (!eb) { - test_err("couldn't allocate test extent buffer"); - kfree(bitmap); - return -ENOMEM; + test_std_err(TEST_ALLOC_ROOT); + ret = -ENOMEM; + goto out; } ret = __test_eb_bitmaps(bitmap, eb, len); out: free_extent_buffer(eb); kfree(bitmap); + btrfs_free_dummy_fs_info(fs_info); return ret; } @@ -434,6 +444,5 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) ret = test_eb_bitmaps(sectorsize, nodesize); out: - test_msg("extent I/O tests finished"); return ret; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index bf15d3a7f20e..87aeabe9d610 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -47,7 +47,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) * ->add_extent_mapping(0, 16K) * -> #handle -EEXIST */ -static void test_case_1(struct btrfs_fs_info *fs_info, +static int test_case_1(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) { struct extent_map *em; @@ -56,9 +56,10 @@ static void test_case_1(struct btrfs_fs_info *fs_info, int ret; em = alloc_extent_map(); - if (!em) - /* Skip the test on error. */ - return; + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } /* Add [0, 16K) */ em->start = 0; @@ -66,25 +67,37 @@ static void test_case_1(struct btrfs_fs_info *fs_info, em->block_start = 0; em->block_len = SZ_16K; ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + if (ret < 0) { + test_err("cannot add extent range [0, 16K)"); + goto out; + } free_extent_map(em); /* Add [16K, 20K) following [0, 16K) */ em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } em->start = SZ_16K; em->len = SZ_4K; em->block_start = SZ_32K; /* avoid merging */ em->block_len = SZ_4K; ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + if (ret < 0) { + test_err("cannot add extent range [16K, 20K)"); + goto out; + } free_extent_map(em); em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } /* Add [0, 8K), should return [0, 16K) instead. */ em->start = start; @@ -92,19 +105,24 @@ static void test_case_1(struct btrfs_fs_info *fs_info, em->block_start = start; em->block_len = len; ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); - if (ret) + if (ret) { test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); + goto out; + } if (em && (em->start != 0 || extent_map_end(em) != SZ_16K || - em->block_start != 0 || em->block_len != SZ_16K)) + em->block_start != 0 || em->block_len != SZ_16K)) { test_err( "case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", start, start + len, ret, em->start, em->len, em->block_start, em->block_len); + ret = -EINVAL; + } free_extent_map(em); out: - /* free memory */ free_extent_map_tree(em_tree); + + return ret; } /* @@ -113,16 +131,17 @@ out: * Reading the inline ending up with EEXIST, ie. read an inline * extent and discard page cache and read it again. */ -static void test_case_2(struct btrfs_fs_info *fs_info, +static int test_case_2(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) { struct extent_map *em; int ret; em = alloc_extent_map(); - if (!em) - /* Skip the test on error. */ - return; + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } /* Add [0, 1K) */ em->start = 0; @@ -130,25 +149,37 @@ static void test_case_2(struct btrfs_fs_info *fs_info, em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + if (ret < 0) { + test_err("cannot add extent range [0, 1K)"); + goto out; + } free_extent_map(em); - /* Add [4K, 4K) following [0, 1K) */ + /* Add [4K, 8K) following [0, 1K) */ em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } em->start = SZ_4K; em->len = SZ_4K; em->block_start = SZ_4K; em->block_len = SZ_4K; ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + if (ret < 0) { + test_err("cannot add extent range [4K, 8K)"); + goto out; + } free_extent_map(em); em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } /* Add [0, 1K) */ em->start = 0; @@ -156,22 +187,27 @@ static void test_case_2(struct btrfs_fs_info *fs_info, em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); - if (ret) + if (ret) { test_err("case2 [0 1K]: ret %d", ret); + goto out; + } if (em && (em->start != 0 || extent_map_end(em) != SZ_1K || - em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) + em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) { test_err( "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", ret, em->start, em->len, em->block_start, em->block_len); + ret = -EINVAL; + } free_extent_map(em); out: - /* free memory */ free_extent_map_tree(em_tree); + + return ret; } -static void __test_case_3(struct btrfs_fs_info *fs_info, +static int __test_case_3(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree, u64 start) { struct extent_map *em; @@ -179,9 +215,10 @@ static void __test_case_3(struct btrfs_fs_info *fs_info, int ret; em = alloc_extent_map(); - if (!em) - /* Skip this test on error. */ - return; + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } /* Add [4K, 8K) */ em->start = SZ_4K; @@ -189,12 +226,18 @@ static void __test_case_3(struct btrfs_fs_info *fs_info, em->block_start = SZ_4K; em->block_len = SZ_4K; ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + if (ret < 0) { + test_err("cannot add extent range [4K, 8K)"); + goto out; + } free_extent_map(em); em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } /* Add [0, 16K) */ em->start = 0; @@ -202,24 +245,29 @@ static void __test_case_3(struct btrfs_fs_info *fs_info, em->block_start = 0; em->block_len = SZ_16K; ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); - if (ret) + if (ret) { test_err("case3 [0x%llx 0x%llx): ret %d", start, start + len, ret); + goto out; + } /* * Since bytes within em are contiguous, em->block_start is identical to * em->start. */ if (em && (start < em->start || start + len > extent_map_end(em) || - em->start != em->block_start || em->len != em->block_len)) + em->start != em->block_start || em->len != em->block_len)) { test_err( "case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", start, start + len, ret, em->start, em->len, em->block_start, em->block_len); + ret = -EINVAL; + } free_extent_map(em); out: - /* free memory */ free_extent_map_tree(em_tree); + + return ret; } /* @@ -238,15 +286,23 @@ out: * -> add_extent_mapping() * -> add_extent_mapping() */ -static void test_case_3(struct btrfs_fs_info *fs_info, +static int test_case_3(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) { - __test_case_3(fs_info, em_tree, 0); - __test_case_3(fs_info, em_tree, SZ_8K); - __test_case_3(fs_info, em_tree, (12 * 1024ULL)); + int ret; + + ret = __test_case_3(fs_info, em_tree, 0); + if (ret) + return ret; + ret = __test_case_3(fs_info, em_tree, SZ_8K); + if (ret) + return ret; + ret = __test_case_3(fs_info, em_tree, (12 * SZ_1K)); + + return ret; } -static void __test_case_4(struct btrfs_fs_info *fs_info, +static int __test_case_4(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree, u64 start) { struct extent_map *em; @@ -254,9 +310,10 @@ static void __test_case_4(struct btrfs_fs_info *fs_info, int ret; em = alloc_extent_map(); - if (!em) - /* Skip this test on error. */ - return; + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } /* Add [0K, 8K) */ em->start = 0; @@ -264,44 +321,60 @@ static void __test_case_4(struct btrfs_fs_info *fs_info, em->block_start = 0; em->block_len = SZ_8K; ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + if (ret < 0) { + test_err("cannot add extent range [0, 8K)"); + goto out; + } free_extent_map(em); em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } - /* Add [8K, 24K) */ + /* Add [8K, 32K) */ em->start = SZ_8K; - em->len = 24 * 1024ULL; + em->len = 24 * SZ_1K; em->block_start = SZ_16K; /* avoid merging */ - em->block_len = 24 * 1024ULL; + em->block_len = 24 * SZ_1K; ret = add_extent_mapping(em_tree, em, 0); - ASSERT(ret == 0); + if (ret < 0) { + test_err("cannot add extent range [8K, 32K)"); + goto out; + } free_extent_map(em); em = alloc_extent_map(); - if (!em) + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; goto out; + } /* Add [0K, 32K) */ em->start = 0; em->len = SZ_32K; em->block_start = 0; em->block_len = SZ_32K; ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); - if (ret) + if (ret) { test_err("case4 [0x%llx 0x%llx): ret %d", start, len, ret); - if (em && - (start < em->start || start + len > extent_map_end(em))) + goto out; + } + if (em && (start < em->start || start + len > extent_map_end(em))) { test_err( "case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", start, len, ret, em->start, em->len, em->block_start, em->block_len); + ret = -EINVAL; + } free_extent_map(em); out: - /* free memory */ free_extent_map_tree(em_tree); + + return ret; } /* @@ -329,17 +402,24 @@ out: * # handle -EEXIST when adding * # [0, 32K) */ -static void test_case_4(struct btrfs_fs_info *fs_info, +static int test_case_4(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) { - __test_case_4(fs_info, em_tree, 0); - __test_case_4(fs_info, em_tree, SZ_4K); + int ret; + + ret = __test_case_4(fs_info, em_tree, 0); + if (ret) + return ret; + ret = __test_case_4(fs_info, em_tree, SZ_4K); + + return ret; } int btrfs_test_extent_map(void) { struct btrfs_fs_info *fs_info = NULL; struct extent_map_tree *em_tree; + int ret = 0; test_msg("running extent_map tests"); @@ -349,25 +429,32 @@ int btrfs_test_extent_map(void) */ fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE); if (!fs_info) { - test_msg("Couldn't allocate dummy fs info"); + test_std_err(TEST_ALLOC_FS_INFO); return -ENOMEM; } em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL); - if (!em_tree) - /* Skip the test on error. */ + if (!em_tree) { + ret = -ENOMEM; goto out; + } extent_map_tree_init(em_tree); - test_case_1(fs_info, em_tree); - test_case_2(fs_info, em_tree); - test_case_3(fs_info, em_tree); - test_case_4(fs_info, em_tree); + ret = test_case_1(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_2(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_3(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_4(fs_info, em_tree); - kfree(em_tree); out: + kfree(em_tree); btrfs_free_dummy_fs_info(fs_info); - return 0; + return ret; } diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 5c2f77e9439b..af89f66f9e63 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -404,7 +404,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, }; const struct btrfs_free_space_op *orig_free_space_ops; - test_msg("running space stealing from bitmap to extent"); + test_msg("running space stealing from bitmap to extent tests"); /* * For this test, we want to ensure we end up with an extent entry @@ -834,9 +834,10 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) test_msg("running btrfs free space cache tests"); fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); - if (!fs_info) + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); return -ENOMEM; - + } /* * For ppc64 (with 64k page size), bytes per bitmap might be @@ -846,13 +847,14 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) cache = btrfs_alloc_dummy_block_group(fs_info, BITS_PER_BITMAP * sectorsize + PAGE_SIZE); if (!cache) { - test_err("couldn't run the tests"); + test_std_err(TEST_ALLOC_BLOCK_GROUP); btrfs_free_dummy_fs_info(fs_info); return 0; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); ret = PTR_ERR(root); goto out; } @@ -874,6 +876,5 @@ out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); - test_msg("free space cache tests finished"); return ret; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 89346da890cf..a90dad166971 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -30,7 +30,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, unsigned int i; int ret; - info = search_free_space_info(trans, fs_info, cache, path, 0); + info = search_free_space_info(trans, cache, path, 0); if (IS_ERR(info)) { test_err("could not find free space info"); ret = PTR_ERR(info); @@ -115,7 +115,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, u32 flags; int ret; - info = search_free_space_info(trans, fs_info, cache, path, 0); + info = search_free_space_info(trans, cache, path, 0); if (IS_ERR(info)) { test_err("could not find free space info"); btrfs_release_path(path); @@ -444,14 +444,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_err("couldn't allocate dummy fs info"); + test_std_err(TEST_ALLOC_FS_INFO); ret = -ENOMEM; goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_err("couldn't allocate dummy root"); + test_std_err(TEST_ALLOC_ROOT); ret = PTR_ERR(root); goto out; } @@ -463,7 +463,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, root->node = alloc_test_extent_buffer(root->fs_info, nodesize); if (!root->node) { - test_err("couldn't allocate dummy buffer"); + test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; goto out; } @@ -473,7 +473,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, cache = btrfs_alloc_dummy_block_group(fs_info, 8 * alignment); if (!cache) { - test_err("couldn't allocate dummy block group cache"); + test_std_err(TEST_ALLOC_BLOCK_GROUP); ret = -ENOMEM; goto out; } @@ -486,7 +486,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, path = btrfs_alloc_path(); if (!path) { - test_err("couldn't allocate path"); + test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; goto out; } @@ -539,7 +539,7 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize, ret = run_test(test_func, 0, sectorsize, nodesize, alignment); if (ret) { test_err( - "%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u", + "%ps failed with extents, sectorsize=%u, nodesize=%u, alignment=%u", test_func, sectorsize, nodesize, alignment); test_ret = ret; } @@ -547,7 +547,7 @@ static int run_test_both_formats(test_func_t test_func, u32 sectorsize, ret = run_test(test_func, 1, sectorsize, nodesize, alignment); if (ret) { test_err( - "%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u", + "%ps failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u", test_func, sectorsize, nodesize, alignment); test_ret = ret; } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index af0c8e30d9e2..bc6dbd1b42fd 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -226,31 +226,34 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) u64 offset; int ret = -ENOMEM; + test_msg("running btrfs_get_extent tests"); + inode = btrfs_new_test_inode(); if (!inode) { - test_err("couldn't allocate inode"); + test_std_err(TEST_ALLOC_INODE); return ret; } + inode->i_mode = S_IFREG; BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_err("couldn't allocate dummy fs info"); + test_std_err(TEST_ALLOC_FS_INFO); goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_err("couldn't allocate root"); + test_std_err(TEST_ALLOC_ROOT); goto out; } root->node = alloc_dummy_extent_buffer(fs_info, nodesize); if (!root->node) { - test_err("couldn't allocate dummy buffer"); + test_std_err(TEST_ALLOC_ROOT); goto out; } @@ -827,9 +830,11 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) struct extent_map *em = NULL; int ret = -ENOMEM; + test_msg("running hole first btrfs_get_extent test"); + inode = btrfs_new_test_inode(); if (!inode) { - test_err("couldn't allocate inode"); + test_std_err(TEST_ALLOC_INODE); return ret; } @@ -839,19 +844,19 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_err("couldn't allocate dummy fs info"); + test_std_err(TEST_ALLOC_FS_INFO); goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_err("couldn't allocate root"); + test_std_err(TEST_ALLOC_ROOT); goto out; } root->node = alloc_dummy_extent_buffer(fs_info, nodesize); if (!root->node) { - test_err("couldn't allocate dummy buffer"); + test_std_err(TEST_ALLOC_ROOT); goto out; } @@ -927,21 +932,23 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) struct btrfs_root *root = NULL; int ret = -ENOMEM; + test_msg("running outstanding_extents tests"); + inode = btrfs_new_test_inode(); if (!inode) { - test_err("couldn't allocate inode"); + test_std_err(TEST_ALLOC_INODE); return ret; } fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_err("couldn't allocate dummy fs info"); + test_std_err(TEST_ALLOC_FS_INFO); goto out; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_err("couldn't allocate root"); + test_std_err(TEST_ALLOC_ROOT); goto out; } @@ -1110,17 +1117,16 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize) { int ret; + test_msg("running inode tests"); + set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); - test_msg("running btrfs_get_extent tests"); ret = test_btrfs_get_extent(sectorsize, nodesize); if (ret) return ret; - test_msg("running hole first btrfs_get_extent test"); ret = test_hole_first(sectorsize, nodesize); if (ret) return ret; - test_msg("running outstanding_extents tests"); return test_extent_accounting(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 412b910b04cc..09aaca1efd62 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -32,7 +32,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, path = btrfs_alloc_path(); if (!path) { - test_err("couldn't allocate path"); + test_std_err(TEST_ALLOC_ROOT); return -ENOMEM; } @@ -82,7 +82,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, path = btrfs_alloc_path(); if (!path) { - test_err("couldn't allocate path"); + test_std_err(TEST_ALLOC_ROOT); return -ENOMEM; } @@ -132,7 +132,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, path = btrfs_alloc_path(); if (!path) { - test_err("couldn't allocate path"); + test_std_err(TEST_ALLOC_ROOT); return -ENOMEM; } path->leave_spinning = 1; @@ -166,7 +166,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, path = btrfs_alloc_path(); if (!path) { - test_err("couldn't allocate path"); + test_std_err(TEST_ALLOC_ROOT); return -ENOMEM; } @@ -215,7 +215,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, btrfs_init_dummy_trans(&trans, fs_info); - test_msg("qgroup basic add"); + test_msg("running qgroup add/remove tests"); ret = btrfs_create_qgroup(&trans, BTRFS_FS_TREE_OBJECTID); if (ret) { test_err("couldn't create a qgroup %d", ret); @@ -316,7 +316,7 @@ static int test_multiple_refs(struct btrfs_root *root, btrfs_init_dummy_trans(&trans, fs_info); - test_msg("qgroup multiple refs test"); + test_msg("running qgroup multiple refs test"); /* * We have BTRFS_FS_TREE_OBJECTID created already from the @@ -457,13 +457,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { - test_err("couldn't allocate dummy fs info"); + test_std_err(TEST_ALLOC_FS_INFO); return -ENOMEM; } root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { - test_err("couldn't allocate root"); + test_std_err(TEST_ALLOC_ROOT); ret = PTR_ERR(root); goto out; } @@ -495,7 +495,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) tmp_root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(tmp_root)) { - test_err("couldn't allocate a fs root"); + test_std_err(TEST_ALLOC_ROOT); ret = PTR_ERR(tmp_root); goto out; } @@ -510,7 +510,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) tmp_root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(tmp_root)) { - test_err("couldn't allocate a fs root"); + test_std_err(TEST_ALLOC_ROOT); ret = PTR_ERR(tmp_root); goto out; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e4e665f422fc..3f6811cdf803 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -50,14 +50,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) btrfs_err(transaction->fs_info, "pending csums is %llu", transaction->delayed_refs.pending_csums); - while (!list_empty(&transaction->pending_chunks)) { - struct extent_map *em; - - em = list_first_entry(&transaction->pending_chunks, - struct extent_map, list); - list_del_init(&em->list); - free_extent_map(em); - } /* * If any block groups are found in ->deleted_bgs then it's * because the transaction was aborted and a commit did not @@ -75,39 +67,11 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) btrfs_put_block_group_trimming(cache); btrfs_put_block_group(cache); } + WARN_ON(!list_empty(&transaction->dev_update_list)); kfree(transaction); } } -static void clear_btree_io_tree(struct extent_io_tree *tree) -{ - spin_lock(&tree->lock); - /* - * Do a single barrier for the waitqueue_active check here, the state - * of the waitqueue should not change once clear_btree_io_tree is - * called. - */ - smp_mb(); - while (!RB_EMPTY_ROOT(&tree->state)) { - struct rb_node *node; - struct extent_state *state; - - node = rb_first(&tree->state); - state = rb_entry(node, struct extent_state, rb_node); - rb_erase(&state->rb_node, &tree->state); - RB_CLEAR_NODE(&state->rb_node); - /* - * btree io trees aren't supposed to have tasks waiting for - * changes in the flags of extent states ever. - */ - ASSERT(!waitqueue_active(&state->wq)); - free_extent_state(state); - - cond_resched_lock(&tree->lock); - } - spin_unlock(&tree->lock); -} - static noinline void switch_commit_roots(struct btrfs_transaction *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -121,7 +85,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans) root->commit_root = btrfs_root_node(root); if (is_fstree(root->root_key.objectid)) btrfs_unpin_free_ino(root); - clear_btree_io_tree(&root->dirty_log_pages); + extent_io_tree_release(&root->dirty_log_pages); btrfs_qgroup_clean_swapped_blocks(root); } @@ -263,19 +227,18 @@ loop: spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->pending_snapshots); - INIT_LIST_HEAD(&cur_trans->pending_chunks); + INIT_LIST_HEAD(&cur_trans->dev_update_list); INIT_LIST_HEAD(&cur_trans->switch_commits); INIT_LIST_HEAD(&cur_trans->dirty_bgs); INIT_LIST_HEAD(&cur_trans->io_bgs); INIT_LIST_HEAD(&cur_trans->dropped_roots); mutex_init(&cur_trans->cache_write_mutex); - cur_trans->num_dirty_bgs = 0; spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); - extent_io_tree_init(&cur_trans->dirty_pages, - fs_info->btree_inode); + extent_io_tree_init(fs_info, &cur_trans->dirty_pages, + IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -928,7 +891,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, * superblock that points to btree nodes/leafs for which * writeback hasn't finished yet (and without errors). * We cleanup any entries left in the io tree when committing - * the transaction (through clear_btree_io_tree()). + * the transaction (through extent_io_tree_release()). */ if (err == -ENOMEM) { err = 0; @@ -973,7 +936,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * left in the io tree. For a log commit, we don't remove them * after committing the log because the tree can be accessed * concurrently - we do it only at transaction commit time when - * it's safe to do it (through clear_btree_io_tree()). + * it's safe to do it (through extent_io_tree_release()). */ err = clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 0, 0, &cached_state); @@ -1051,7 +1014,7 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) blk_finish_plug(&plug); ret2 = btrfs_wait_extents(fs_info, dirty_pages); - clear_btree_io_tree(&trans->transaction->dirty_pages); + extent_io_tree_release(&trans->transaction->dirty_pages); if (ret) return ret; @@ -1130,17 +1093,17 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) if (ret) return ret; - ret = btrfs_run_dev_stats(trans, fs_info); + ret = btrfs_run_dev_stats(trans); if (ret) return ret; - ret = btrfs_run_dev_replace(trans, fs_info); + ret = btrfs_run_dev_replace(trans); if (ret) return ret; ret = btrfs_run_qgroups(trans); if (ret) return ret; - ret = btrfs_setup_space_cache(trans, fs_info); + ret = btrfs_setup_space_cache(trans); if (ret) return ret; @@ -1168,7 +1131,7 @@ again: } while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { - ret = btrfs_write_dirty_block_groups(trans, fs_info); + ret = btrfs_write_dirty_block_groups(trans); if (ret) return ret; ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); @@ -2241,8 +2204,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_copy)); - btrfs_update_commit_device_size(fs_info); - btrfs_update_commit_device_bytes_used(cur_trans); + btrfs_commit_device_sizes(cur_trans); clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index f1ba78949d1b..78c446c222b7 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -51,7 +51,7 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; - struct list_head pending_chunks; + struct list_head dev_update_list; struct list_head switch_commits; struct list_head dirty_bgs; @@ -80,7 +80,6 @@ struct btrfs_transaction { */ struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; - unsigned int num_dirty_bgs; /* Protected by spin lock fs_info->unused_bgs_lock. */ struct list_head deleted_bgs; spinlock_t dropped_roots_lock; @@ -120,7 +119,6 @@ struct btrfs_trans_handle { bool allocating_chunk; bool can_flush_pending_bgs; bool reloc_reserved; - bool sync; bool dirty; struct btrfs_root *root; struct btrfs_fs_info *fs_info; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a62e1e837a89..748cd1598255 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -15,6 +15,9 @@ * carefully reviewed otherwise so it does not prevent mount of valid images. */ +#include <linux/types.h> +#include <linux/stddef.h> +#include <linux/error-injection.h> #include "ctree.h" #include "tree-checker.h" #include "disk-io.h" @@ -41,12 +44,12 @@ * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt. * Allows callers to customize the output. */ -__printf(4, 5) +__printf(3, 4) __cold -static void generic_err(const struct btrfs_fs_info *fs_info, - const struct extent_buffer *eb, int slot, +static void generic_err(const struct extent_buffer *eb, int slot, const char *fmt, ...) { + const struct btrfs_fs_info *fs_info = eb->fs_info; struct va_format vaf; va_list args; @@ -66,12 +69,12 @@ static void generic_err(const struct btrfs_fs_info *fs_info, * Customized reporter for extent data item, since its key objectid and * offset has its own meaning. */ -__printf(4, 5) +__printf(3, 4) __cold -static void file_extent_err(const struct btrfs_fs_info *fs_info, - const struct extent_buffer *eb, int slot, +static void file_extent_err(const struct extent_buffer *eb, int slot, const char *fmt, ...) { + const struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_key key; struct va_format vaf; va_list args; @@ -94,26 +97,26 @@ static void file_extent_err(const struct btrfs_fs_info *fs_info, * Return 0 if the btrfs_file_extent_##name is aligned to @alignment * Else return 1 */ -#define CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, name, alignment) \ +#define CHECK_FE_ALIGNED(leaf, slot, fi, name, alignment) \ ({ \ if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \ - file_extent_err((fs_info), (leaf), (slot), \ + file_extent_err((leaf), (slot), \ "invalid %s for file extent, have %llu, should be aligned to %u", \ (#name), btrfs_file_extent_##name((leaf), (fi)), \ (alignment)); \ (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \ }) -static int check_extent_data_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, +static int check_extent_data_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_file_extent_item *fi; u32 sectorsize = fs_info->sectorsize; u32 item_size = btrfs_item_size_nr(leaf, slot); if (!IS_ALIGNED(key->offset, sectorsize)) { - file_extent_err(fs_info, leaf, slot, + file_extent_err(leaf, slot, "unaligned file_offset for file extent, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; @@ -122,7 +125,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info, fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { - file_extent_err(fs_info, leaf, slot, + file_extent_err(leaf, slot, "invalid type for file extent, have %u expect range [0, %u]", btrfs_file_extent_type(leaf, fi), BTRFS_FILE_EXTENT_TYPES); @@ -134,14 +137,14 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info, * and must be caught in open_ctree(). */ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { - file_extent_err(fs_info, leaf, slot, + file_extent_err(leaf, slot, "invalid compression for file extent, have %u expect range [0, %u]", btrfs_file_extent_compression(leaf, fi), BTRFS_COMPRESS_TYPES); return -EUCLEAN; } if (btrfs_file_extent_encryption(leaf, fi)) { - file_extent_err(fs_info, leaf, slot, + file_extent_err(leaf, slot, "invalid encryption for file extent, have %u expect 0", btrfs_file_extent_encryption(leaf, fi)); return -EUCLEAN; @@ -149,7 +152,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info, if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { /* Inline extent must have 0 as key offset */ if (key->offset) { - file_extent_err(fs_info, leaf, slot, + file_extent_err(leaf, slot, "invalid file_offset for inline file extent, have %llu expect 0", key->offset); return -EUCLEAN; @@ -163,7 +166,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info, /* Uncompressed inline extent size must match item size */ if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + btrfs_file_extent_ram_bytes(leaf, fi)) { - file_extent_err(fs_info, leaf, slot, + file_extent_err(leaf, slot, "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + btrfs_file_extent_ram_bytes(leaf, fi)); @@ -174,41 +177,41 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info, /* Regular or preallocated extent has fixed item size */ if (item_size != sizeof(*fi)) { - file_extent_err(fs_info, leaf, slot, + file_extent_err(leaf, slot, "invalid item size for reg/prealloc file extent, have %u expect %zu", item_size, sizeof(*fi)); return -EUCLEAN; } - if (CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, ram_bytes, sectorsize) || - CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_bytenr, sectorsize) || - CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_num_bytes, sectorsize) || - CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, offset, sectorsize) || - CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, num_bytes, sectorsize)) + if (CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)) return -EUCLEAN; return 0; } -static int check_csum_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, struct btrfs_key *key, +static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { + struct btrfs_fs_info *fs_info = leaf->fs_info; u32 sectorsize = fs_info->sectorsize; u32 csumsize = btrfs_super_csum_size(fs_info->super_copy); if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "invalid key objectid for csum item, have %llu expect %llu", key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); return -EUCLEAN; } if (!IS_ALIGNED(key->offset, sectorsize)) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "unaligned key offset for csum item, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; } if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "unaligned item size for csum item, have %u should be aligned to %u", btrfs_item_size_nr(leaf, slot), csumsize); return -EUCLEAN; @@ -220,12 +223,12 @@ static int check_csum_item(struct btrfs_fs_info *fs_info, * Customized reported for dir_item, only important new info is key->objectid, * which represents inode number */ -__printf(4, 5) +__printf(3, 4) __cold -static void dir_item_err(const struct btrfs_fs_info *fs_info, - const struct extent_buffer *eb, int slot, +static void dir_item_err(const struct extent_buffer *eb, int slot, const char *fmt, ...) { + const struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_key key; struct va_format vaf; va_list args; @@ -244,10 +247,10 @@ static void dir_item_err(const struct btrfs_fs_info *fs_info, va_end(args); } -static int check_dir_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, +static int check_dir_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_dir_item *di; u32 item_size = btrfs_item_size_nr(leaf, slot); u32 cur = 0; @@ -263,7 +266,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, /* header itself should not cross item boundary */ if (cur + sizeof(*di) > item_size) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "dir item header crosses item boundary, have %zu boundary %u", cur + sizeof(*di), item_size); return -EUCLEAN; @@ -272,7 +275,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, /* dir type check */ dir_type = btrfs_dir_type(leaf, di); if (dir_type >= BTRFS_FT_MAX) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "invalid dir item type, have %u expect [0, %u)", dir_type, BTRFS_FT_MAX); return -EUCLEAN; @@ -280,14 +283,14 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, if (key->type == BTRFS_XATTR_ITEM_KEY && dir_type != BTRFS_FT_XATTR) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "invalid dir item type for XATTR key, have %u expect %u", dir_type, BTRFS_FT_XATTR); return -EUCLEAN; } if (dir_type == BTRFS_FT_XATTR && key->type != BTRFS_XATTR_ITEM_KEY) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "xattr dir type found for non-XATTR key"); return -EUCLEAN; } @@ -300,13 +303,13 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, name_len = btrfs_dir_name_len(leaf, di); data_len = btrfs_dir_data_len(leaf, di); if (name_len > max_name_len) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "dir item name len too long, have %u max %u", name_len, max_name_len); return -EUCLEAN; } if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "dir item name and data len too long, have %u max %u", name_len + data_len, BTRFS_MAX_XATTR_SIZE(fs_info)); @@ -314,7 +317,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, } if (data_len && dir_type != BTRFS_FT_XATTR) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "dir item with invalid data len, have %u expect 0", data_len); return -EUCLEAN; @@ -324,7 +327,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, /* header and name/data should not cross item boundary */ if (cur + total_size > item_size) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "dir item data crosses item boundary, have %u boundary %u", cur + total_size, item_size); return -EUCLEAN; @@ -342,7 +345,7 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, (unsigned long)(di + 1), name_len); name_hash = btrfs_name_hash(namebuf, name_len); if (key->offset != name_hash) { - dir_item_err(fs_info, leaf, slot, + dir_item_err(leaf, slot, "name hash mismatch with key, have 0x%016x expect 0x%016llx", name_hash, key->offset); return -EUCLEAN; @@ -354,12 +357,12 @@ static int check_dir_item(struct btrfs_fs_info *fs_info, return 0; } -__printf(4, 5) +__printf(3, 4) __cold -static void block_group_err(const struct btrfs_fs_info *fs_info, - const struct extent_buffer *eb, int slot, +static void block_group_err(const struct extent_buffer *eb, int slot, const char *fmt, ...) { + const struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_key key; struct va_format vaf; va_list args; @@ -378,8 +381,7 @@ static void block_group_err(const struct btrfs_fs_info *fs_info, va_end(args); } -static int check_block_group_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, +static int check_block_group_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_block_group_item bgi; @@ -392,13 +394,13 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, * handle it. We care more about the size. */ if (key->offset == 0) { - block_group_err(fs_info, leaf, slot, + block_group_err(leaf, slot, "invalid block group size 0"); return -EUCLEAN; } if (item_size != sizeof(bgi)) { - block_group_err(fs_info, leaf, slot, + block_group_err(leaf, slot, "invalid item size, have %u expect %zu", item_size, sizeof(bgi)); return -EUCLEAN; @@ -408,7 +410,7 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, sizeof(bgi)); if (btrfs_block_group_chunk_objectid(&bgi) != BTRFS_FIRST_CHUNK_TREE_OBJECTID) { - block_group_err(fs_info, leaf, slot, + block_group_err(leaf, slot, "invalid block group chunk objectid, have %llu expect %llu", btrfs_block_group_chunk_objectid(&bgi), BTRFS_FIRST_CHUNK_TREE_OBJECTID); @@ -416,7 +418,7 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, } if (btrfs_block_group_used(&bgi) > key->offset) { - block_group_err(fs_info, leaf, slot, + block_group_err(leaf, slot, "invalid block group used, have %llu expect [0, %llu)", btrfs_block_group_used(&bgi), key->offset); return -EUCLEAN; @@ -424,7 +426,7 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, flags = btrfs_block_group_flags(&bgi); if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) { - block_group_err(fs_info, leaf, slot, + block_group_err(leaf, slot, "invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set", flags & BTRFS_BLOCK_GROUP_PROFILE_MASK, hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)); @@ -437,7 +439,7 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, type != BTRFS_BLOCK_GROUP_SYSTEM && type != (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA)) { - block_group_err(fs_info, leaf, slot, + block_group_err(leaf, slot, "invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", type, hweight64(type), BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, @@ -448,37 +450,367 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, return 0; } +__printf(4, 5) +__cold +static void chunk_err(const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + const char *fmt, ...) +{ + const struct btrfs_fs_info *fs_info = leaf->fs_info; + bool is_sb; + struct va_format vaf; + va_list args; + int i; + int slot = -1; + + /* Only superblock eb is able to have such small offset */ + is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET); + + if (!is_sb) { + /* + * Get the slot number by iterating through all slots, this + * would provide better readability. + */ + for (i = 0; i < btrfs_header_nritems(leaf); i++) { + if (btrfs_item_ptr_offset(leaf, i) == + (unsigned long)chunk) { + slot = i; + break; + } + } + } + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + if (is_sb) + btrfs_crit(fs_info, + "corrupt superblock syschunk array: chunk_start=%llu, %pV", + logical, &vaf); + else + btrfs_crit(fs_info, + "corrupt leaf: root=%llu block=%llu slot=%d chunk_start=%llu, %pV", + BTRFS_CHUNK_TREE_OBJECTID, leaf->start, slot, + logical, &vaf); + va_end(args); +} + +/* + * The common chunk check which could also work on super block sys chunk array. + * + * Return -EUCLEAN if anything is corrupted. + * Return 0 if everything is OK. + */ +int btrfs_check_chunk_valid(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + u64 length; + u64 stripe_len; + u16 num_stripes; + u16 sub_stripes; + u64 type; + u64 features; + bool mixed = false; + + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + + if (!num_stripes) { + chunk_err(leaf, chunk, logical, + "invalid chunk num_stripes, have %u", num_stripes); + return -EUCLEAN; + } + if (!IS_ALIGNED(logical, fs_info->sectorsize)) { + chunk_err(leaf, chunk, logical, + "invalid chunk logical, have %llu should aligned to %u", + logical, fs_info->sectorsize); + return -EUCLEAN; + } + if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { + chunk_err(leaf, chunk, logical, + "invalid chunk sectorsize, have %u expect %u", + btrfs_chunk_sector_size(leaf, chunk), + fs_info->sectorsize); + return -EUCLEAN; + } + if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { + chunk_err(leaf, chunk, logical, + "invalid chunk length, have %llu", length); + return -EUCLEAN; + } + if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { + chunk_err(leaf, chunk, logical, + "invalid chunk stripe length: %llu", + stripe_len); + return -EUCLEAN; + } + if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & + type) { + chunk_err(leaf, chunk, logical, + "unrecognized chunk type: 0x%llx", + ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)); + return -EUCLEAN; + } + + if (!is_power_of_2(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && + (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) { + chunk_err(leaf, chunk, logical, + "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EUCLEAN; + } + if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { + chunk_err(leaf, chunk, logical, + "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", + type, BTRFS_BLOCK_GROUP_TYPE_MASK); + return -EUCLEAN; + } + + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && + (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { + chunk_err(leaf, chunk, logical, + "system chunk with data or metadata type: 0x%llx", + type); + return -EUCLEAN; + } + + features = btrfs_super_incompat_flags(fs_info->super_copy); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = true; + + if (!mixed) { + if ((type & BTRFS_BLOCK_GROUP_METADATA) && + (type & BTRFS_BLOCK_GROUP_DATA)) { + chunk_err(leaf, chunk, logical, + "mixed chunk type in non-mixed mode: 0x%llx", type); + return -EUCLEAN; + } + } + + if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || + (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || + ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) { + chunk_err(leaf, chunk, logical, + "invalid num_stripes:sub_stripes %u:%u for profile %llu", + num_stripes, sub_stripes, + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EUCLEAN; + } + + return 0; +} + +__printf(3, 4) +__cold +static void dev_item_err(const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(eb->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, &vaf); + va_end(args); +} + +static int check_dev_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_dev_item *ditem; + u64 max_devid = max(BTRFS_MAX_DEVS(fs_info), BTRFS_MAX_DEVS_SYS_CHUNK); + + if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) { + dev_item_err(leaf, slot, + "invalid objectid: has=%llu expect=%llu", + key->objectid, BTRFS_DEV_ITEMS_OBJECTID); + return -EUCLEAN; + } + if (key->offset > max_devid) { + dev_item_err(leaf, slot, + "invalid devid: has=%llu expect=[0, %llu]", + key->offset, max_devid); + return -EUCLEAN; + } + ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); + if (btrfs_device_id(leaf, ditem) != key->offset) { + dev_item_err(leaf, slot, + "devid mismatch: key has=%llu item has=%llu", + key->offset, btrfs_device_id(leaf, ditem)); + return -EUCLEAN; + } + + /* + * For device total_bytes, we don't have reliable way to check it, as + * it can be 0 for device removal. Device size check can only be done + * by dev extents check. + */ + if (btrfs_device_bytes_used(leaf, ditem) > + btrfs_device_total_bytes(leaf, ditem)) { + dev_item_err(leaf, slot, + "invalid bytes used: have %llu expect [0, %llu]", + btrfs_device_bytes_used(leaf, ditem), + btrfs_device_total_bytes(leaf, ditem)); + return -EUCLEAN; + } + /* + * Remaining members like io_align/type/gen/dev_group aren't really + * utilized. Skip them to make later usage of them easier. + */ + return 0; +} + +/* Inode item error output has the same format as dir_item_err() */ +#define inode_item_err(fs_info, eb, slot, fmt, ...) \ + dir_item_err(eb, slot, fmt, __VA_ARGS__) + +static int check_inode_item(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_inode_item *iitem; + u64 super_gen = btrfs_super_generation(fs_info->super_copy); + u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); + u32 mode; + + if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID || + key->objectid > BTRFS_LAST_FREE_OBJECTID) && + key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && + key->objectid != BTRFS_FREE_INO_OBJECTID) { + generic_err(leaf, slot, + "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", + key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, + BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID, + BTRFS_FREE_INO_OBJECTID); + return -EUCLEAN; + } + if (key->offset != 0) { + inode_item_err(fs_info, leaf, slot, + "invalid key offset: has %llu expect 0", + key->offset); + return -EUCLEAN; + } + iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); + + /* Here we use super block generation + 1 to handle log tree */ + if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) { + inode_item_err(fs_info, leaf, slot, + "invalid inode generation: has %llu expect (0, %llu]", + btrfs_inode_generation(leaf, iitem), + super_gen + 1); + return -EUCLEAN; + } + /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ + if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { + inode_item_err(fs_info, leaf, slot, + "invalid inode generation: has %llu expect [0, %llu]", + btrfs_inode_transid(leaf, iitem), super_gen + 1); + return -EUCLEAN; + } + + /* + * For size and nbytes it's better not to be too strict, as for dir + * item its size/nbytes can easily get wrong, but doesn't affect + * anything in the fs. So here we skip the check. + */ + mode = btrfs_inode_mode(leaf, iitem); + if (mode & ~valid_mask) { + inode_item_err(fs_info, leaf, slot, + "unknown mode bit detected: 0x%x", + mode & ~valid_mask); + return -EUCLEAN; + } + + /* + * S_IFMT is not bit mapped so we can't completely rely on is_power_of_2, + * but is_power_of_2() can save us from checking FIFO/CHR/DIR/REG. + * Only needs to check BLK, LNK and SOCKS + */ + if (!is_power_of_2(mode & S_IFMT)) { + if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) { + inode_item_err(fs_info, leaf, slot, + "invalid mode: has 0%o expect valid S_IF* bit(s)", + mode & S_IFMT); + return -EUCLEAN; + } + } + if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) { + inode_item_err(fs_info, leaf, slot, + "invalid nlink: has %u expect no more than 1 for dir", + btrfs_inode_nlink(leaf, iitem)); + return -EUCLEAN; + } + if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) { + inode_item_err(fs_info, leaf, slot, + "unknown flags detected: 0x%llx", + btrfs_inode_flags(leaf, iitem) & + ~BTRFS_INODE_FLAG_MASK); + return -EUCLEAN; + } + return 0; +} + /* * Common point to switch the item-specific validation. */ -static int check_leaf_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, +static int check_leaf_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { int ret = 0; + struct btrfs_chunk *chunk; switch (key->type) { case BTRFS_EXTENT_DATA_KEY: - ret = check_extent_data_item(fs_info, leaf, key, slot); + ret = check_extent_data_item(leaf, key, slot); break; case BTRFS_EXTENT_CSUM_KEY: - ret = check_csum_item(fs_info, leaf, key, slot); + ret = check_csum_item(leaf, key, slot); break; case BTRFS_DIR_ITEM_KEY: case BTRFS_DIR_INDEX_KEY: case BTRFS_XATTR_ITEM_KEY: - ret = check_dir_item(fs_info, leaf, key, slot); + ret = check_dir_item(leaf, key, slot); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: - ret = check_block_group_item(fs_info, leaf, key, slot); + ret = check_block_group_item(leaf, key, slot); + break; + case BTRFS_CHUNK_ITEM_KEY: + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = btrfs_check_chunk_valid(leaf, chunk, key->offset); + break; + case BTRFS_DEV_ITEM_KEY: + ret = check_dev_item(leaf, key, slot); + break; + case BTRFS_INODE_ITEM_KEY: + ret = check_inode_item(leaf, key, slot); break; } return ret; } -static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, - bool check_item_data) +static int check_leaf(struct extent_buffer *leaf, bool check_item_data) { + struct btrfs_fs_info *fs_info = leaf->fs_info; /* No valid key type is 0, so all key should be larger than this key */ struct btrfs_key prev_key = {0, 0, 0}; struct btrfs_key key; @@ -486,7 +818,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, int slot; if (btrfs_header_level(leaf) != 0) { - generic_err(fs_info, leaf, 0, + generic_err(leaf, 0, "invalid level for leaf, have %d expect 0", btrfs_header_level(leaf)); return -EUCLEAN; @@ -502,7 +834,6 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, */ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { u64 owner = btrfs_header_owner(leaf); - struct btrfs_root *check_root; /* These trees must never be empty */ if (owner == BTRFS_ROOT_TREE_OBJECTID || @@ -511,34 +842,11 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, owner == BTRFS_DEV_TREE_OBJECTID || owner == BTRFS_FS_TREE_OBJECTID || owner == BTRFS_DATA_RELOC_TREE_OBJECTID) { - generic_err(fs_info, leaf, 0, + generic_err(leaf, 0, "invalid root, root %llu must never be empty", owner); return -EUCLEAN; } - key.objectid = owner; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - check_root = btrfs_get_fs_root(fs_info, &key, false); - /* - * The only reason we also check NULL here is that during - * open_ctree() some roots has not yet been set up. - */ - if (!IS_ERR_OR_NULL(check_root)) { - struct extent_buffer *eb; - - eb = btrfs_root_node(check_root); - /* if leaf is the root, then it's fine */ - if (leaf != eb) { - generic_err(fs_info, leaf, 0, - "invalid nritems, have %u should not be 0 for non-root leaf", - nritems); - free_extent_buffer(eb); - return -EUCLEAN; - } - free_extent_buffer(eb); - } return 0; } @@ -564,7 +872,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, /* Make sure the keys are in the right order */ if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", prev_key.objectid, prev_key.type, prev_key.offset, key.objectid, key.type, @@ -583,7 +891,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, item_end_expected = btrfs_item_offset_nr(leaf, slot - 1); if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "unexpected item end, have %u expect %u", btrfs_item_end_nr(leaf, slot), item_end_expected); @@ -597,7 +905,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, */ if (btrfs_item_end_nr(leaf, slot) > BTRFS_LEAF_DATA_SIZE(fs_info)) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "slot end outside of leaf, have %u expect range [0, %u]", btrfs_item_end_nr(leaf, slot), BTRFS_LEAF_DATA_SIZE(fs_info)); @@ -607,7 +915,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, /* Also check if the item pointer overlaps with btrfs item. */ if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > btrfs_item_ptr_offset(leaf, slot)) { - generic_err(fs_info, leaf, slot, + generic_err(leaf, slot, "slot overlaps with its data, item end %lu data start %lu", btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item), @@ -620,7 +928,7 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, * Check if the item size and content meet other * criteria */ - ret = check_leaf_item(fs_info, leaf, &key, slot); + ret = check_leaf_item(leaf, &key, slot); if (ret < 0) return ret; } @@ -633,20 +941,20 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, return 0; } -int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf) +int btrfs_check_leaf_full(struct extent_buffer *leaf) { - return check_leaf(fs_info, leaf, true); + return check_leaf(leaf, true); } +ALLOW_ERROR_INJECTION(btrfs_check_leaf_full, ERRNO); -int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf) +int btrfs_check_leaf_relaxed(struct extent_buffer *leaf) { - return check_leaf(fs_info, leaf, false); + return check_leaf(leaf, false); } -int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) +int btrfs_check_node(struct extent_buffer *node) { + struct btrfs_fs_info *fs_info = node->fs_info; unsigned long nr = btrfs_header_nritems(node); struct btrfs_key key, next_key; int slot; @@ -655,7 +963,7 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) int ret = 0; if (level <= 0 || level >= BTRFS_MAX_LEVEL) { - generic_err(fs_info, node, 0, + generic_err(node, 0, "invalid level for node, have %d expect [1, %d]", level, BTRFS_MAX_LEVEL - 1); return -EUCLEAN; @@ -675,13 +983,13 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) btrfs_node_key_to_cpu(node, &next_key, slot + 1); if (!bytenr) { - generic_err(fs_info, node, slot, + generic_err(node, slot, "invalid NULL node pointer"); ret = -EUCLEAN; goto out; } if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) { - generic_err(fs_info, node, slot, + generic_err(node, slot, "unaligned pointer, have %llu should be aligned to %u", bytenr, fs_info->sectorsize); ret = -EUCLEAN; @@ -689,7 +997,7 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) } if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { - generic_err(fs_info, node, slot, + generic_err(node, slot, "bad key order, current (%llu %u %llu) next (%llu %u %llu)", key.objectid, key.type, key.offset, next_key.objectid, next_key.type, @@ -701,3 +1009,4 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) out: return ret; } +ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index ff043275b784..32fecc9dc1dd 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -14,15 +14,16 @@ * Will check not only the item pointers, but also every possible member * in item data. */ -int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf); +int btrfs_check_leaf_full(struct extent_buffer *leaf); /* * Less strict leaf checker. * Will only check item pointers, not reading item data. */ -int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf); -int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node); +int btrfs_check_leaf_relaxed(struct extent_buffer *leaf); +int btrfs_check_node(struct extent_buffer *node); + +int btrfs_check_chunk_valid(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 561884f60d35..6adcd8a2c5c7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -139,7 +139,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); if (root->log_root) { - if (btrfs_need_log_full_commit(fs_info, trans)) { + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; goto out; } @@ -225,6 +225,17 @@ void btrfs_end_log_trans(struct btrfs_root *root) } } +static int btrfs_write_tree_block(struct extent_buffer *buf) +{ + return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, + buf->start + buf->len - 1); +} + +static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ + filemap_fdatawait_range(buf->pages[0]->mapping, + buf->start, buf->start + buf->len - 1); +} /* * the walk control struct is used to pass state down the chain when @@ -304,7 +315,7 @@ static int process_one_buffer(struct btrfs_root *log, if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { if (wc->pin && btrfs_header_level(eb) == 0) - ret = btrfs_exclude_logged_extents(fs_info, eb); + ret = btrfs_exclude_logged_extents(eb); if (wc->write) btrfs_write_tree_block(eb); if (wc->wait) @@ -333,7 +344,6 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; u32 item_size; u64 saved_i_size = 0; @@ -454,10 +464,9 @@ insert: found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); if (found_size > item_size) - btrfs_truncate_item(fs_info, path, item_size, 1); + btrfs_truncate_item(path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(fs_info, path, - item_size - found_size); + btrfs_extend_item(path, item_size - found_size); } else if (ret) { return ret; } @@ -694,9 +703,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; if (ins.objectid > 0) { + struct btrfs_ref ref = { 0 }; u64 csum_start; u64 csum_end; LIST_HEAD(ordered_sums); + /* * is this extent already allocated in the extent * allocation tree? If so, just add a reference @@ -704,10 +715,13 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); if (ret == 0) { - ret = btrfs_inc_extent_ref(trans, root, - ins.objectid, ins.offset, - 0, root->root_key.objectid, + btrfs_init_generic_ref(&ref, + BTRFS_ADD_DELAYED_REF, + ins.objectid, ins.offset, 0); + btrfs_init_data_ref(&ref, + root->root_key.objectid, key->objectid, offset); + ret = btrfs_inc_extent_ref(trans, &ref); if (ret) goto out; } else { @@ -2725,7 +2739,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking_write(next); - clean_tree_block(fs_info, next); + btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } else { @@ -2809,7 +2823,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking_write(next); - clean_tree_block(fs_info, next); + btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } else { @@ -2891,7 +2905,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking_write(next); - clean_tree_block(fs_info, next); + btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } else { @@ -3066,7 +3080,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (btrfs_need_log_full_commit(fs_info, trans)) { + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; mutex_unlock(&root->log_mutex); goto out; @@ -3085,7 +3099,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret); - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); mutex_unlock(&root->log_mutex); goto out; } @@ -3127,7 +3141,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); if (ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); @@ -3173,7 +3187,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (btrfs_need_log_full_commit(fs_info, trans)) { + if (btrfs_need_log_full_commit(trans)) { blk_finish_plug(&plug); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); @@ -3186,7 +3200,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; @@ -3196,7 +3210,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_wait_tree_log_extents(log_root_tree, EXTENT_NEW | EXTENT_DIRTY); if (ret) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } @@ -3218,7 +3232,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = write_all_supers(fs_info, 1); if (ret) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); goto out_wake_log_root; } @@ -3422,7 +3436,7 @@ fail: out_unlock: mutex_unlock(&dir->log_mutex); if (ret == -ENOSPC) { - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(trans); ret = 0; } else if (ret < 0) btrfs_abort_transaction(trans, ret); @@ -3438,7 +3452,6 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *inode, u64 dirid) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log; u64 index; int ret; @@ -3456,7 +3469,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, dirid, &index); mutex_unlock(&inode->log_mutex); if (ret == -ENOSPC) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) btrfs_abort_transaction(trans, ret); @@ -5442,7 +5455,7 @@ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, * Make sure any commits to the log are forced to be full * commits. */ - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); ret = true; } mutex_unlock(&inode->log_mutex); @@ -5819,6 +5832,190 @@ out: return ret; } +static int log_new_ancestors(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_key found_key; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + + while (true) { + struct btrfs_fs_info *fs_info = root->fs_info; + const u64 last_committed = fs_info->last_trans_committed; + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_key search_key; + struct inode *inode; + int ret = 0; + + btrfs_release_path(path); + + search_key.objectid = found_key.offset; + search_key.type = BTRFS_INODE_ITEM_KEY; + search_key.offset = 0; + inode = btrfs_iget(fs_info->sb, &search_key, root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (BTRFS_I(inode)->generation > last_committed) + ret = btrfs_log_inode(trans, root, BTRFS_I(inode), + LOG_INODE_EXISTS, + 0, LLONG_MAX, ctx); + iput(inode); + if (ret) + return ret; + + if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) + break; + + search_key.type = BTRFS_INODE_REF_KEY; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + return ret; + + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + return -ENOENT; + leaf = path->nodes[0]; + slot = path->slots[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != search_key.objectid || + found_key.type != BTRFS_INODE_REF_KEY) + return -ENOENT; + } + return 0; +} + +static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct dentry *parent, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct dentry *old_parent = NULL; + struct super_block *sb = inode->vfs_inode.i_sb; + int ret = 0; + + while (true) { + if (!parent || d_really_is_negative(parent) || + sb != parent->d_sb) + break; + + inode = BTRFS_I(d_inode(parent)); + if (root != inode->root) + break; + + if (inode->generation > fs_info->last_trans_committed) { + ret = btrfs_log_inode(trans, root, inode, + LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); + if (ret) + break; + } + if (IS_ROOT(parent)) + break; + + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; + } + dput(old_parent); + + return ret; +} + +static int log_all_new_ancestors(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct dentry *parent, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *root = inode->root; + const u64 ino = btrfs_ino(inode); + struct btrfs_path *path; + struct btrfs_key search_key; + int ret; + + /* + * For a single hard link case, go through a fast path that does not + * need to iterate the fs/subvolume tree. + */ + if (inode->vfs_inode.i_nlink < 2) + return log_new_ancestors_fast(trans, inode, parent, ctx); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + search_key.objectid = ino; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = 0; +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + if (ret == 0) + path->slots[0]++; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_key found_key; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != ino || + found_key.type > BTRFS_INODE_EXTREF_KEY) + break; + + /* + * Don't deal with extended references because they are rare + * cases and too complex to deal with (we would need to keep + * track of which subitem we are processing for each item in + * this loop, etc). So just return some error to fallback to + * a transaction commit. + */ + if (found_key.type == BTRFS_INODE_EXTREF_KEY) { + ret = -EMLINK; + goto out; + } + + /* + * Logging ancestors needs to do more searches on the fs/subvol + * tree, so it releases the path as needed to avoid deadlocks. + * Keep track of the last inode ref key and resume from that key + * after logging all new ancestors for the current hard link. + */ + memcpy(&search_key, &found_key, sizeof(search_key)); + + ret = log_new_ancestors(trans, root, path, ctx); + if (ret) + goto out; + btrfs_release_path(path); + goto again; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -5836,11 +6033,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct super_block *sb; - struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = fs_info->last_trans_committed; bool log_dentries = false; - struct btrfs_inode *orig_inode = inode; sb = inode->vfs_inode.i_sb; @@ -5946,56 +6141,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * and has a link count of 2. */ if (inode->last_unlink_trans > last_committed) { - ret = btrfs_log_all_parents(trans, orig_inode, ctx); + ret = btrfs_log_all_parents(trans, inode, ctx); if (ret) goto end_trans; } - /* - * If a new hard link was added to the inode in the current transaction - * and its link count is now greater than 1, we need to fallback to a - * transaction commit, otherwise we can end up not logging all its new - * parents for all the hard links. Here just from the dentry used to - * fsync, we can not visit the ancestor inodes for all the other hard - * links to figure out if any is new, so we fallback to a transaction - * commit (instead of adding a lot of complexity of scanning a btree, - * since this scenario is not a common use case). - */ - if (inode->vfs_inode.i_nlink > 1 && - inode->last_link_trans > last_committed) { - ret = -EMLINK; + ret = log_all_new_ancestors(trans, inode, parent, ctx); + if (ret) goto end_trans; - } - while (1) { - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - break; - - inode = BTRFS_I(d_inode(parent)); - if (root != inode->root) - break; - - if (inode->generation > last_committed) { - ret = btrfs_log_inode(trans, root, inode, - LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); - if (ret) - goto end_trans; - } - if (IS_ROOT(parent)) - break; - - parent = dget_parent(parent); - dput(old_parent); - old_parent = parent; - } if (log_dentries) - ret = log_new_dir_dentries(trans, root, orig_inode, ctx); + ret = log_new_dir_dentries(trans, root, inode, ctx); else ret = 0; end_trans: - dput(old_parent); if (ret < 0) { - btrfs_set_log_full_commit(fs_info, trans); + btrfs_set_log_full_commit(trans); ret = 1; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 0fab84a8f670..132e43d29034 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -30,16 +30,14 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, INIT_LIST_HEAD(&ctx->list); } -static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans) +static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) { - WRITE_ONCE(fs_info->last_trans_log_full_commit, trans->transid); + WRITE_ONCE(trans->fs_info->last_trans_log_full_commit, trans->transid); } -static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans) +static inline int btrfs_need_log_full_commit(struct btrfs_trans_handle *trans) { - return READ_ONCE(fs_info->last_trans_log_full_commit) == + return READ_ONCE(trans->fs_info->last_trans_log_full_commit) == trans->transid; } diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 3b2ae342e649..91caab63bdf5 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -121,12 +121,12 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, * An item with that type already exists. * Extend the item and store the new subid at the end. */ - btrfs_extend_item(fs_info, path, sizeof(subid_le)); + btrfs_extend_item(path, sizeof(subid_le)); eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); - } else if (ret < 0) { + } else { btrfs_warn(fs_info, "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", ret, (unsigned long long)key.objectid, @@ -219,7 +219,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, move_src = offset + sizeof(subid); move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot)); memmove_extent_buffer(eb, move_dst, move_src, move_len); - btrfs_truncate_item(fs_info, path, item_size - sizeof(subid), 1); + btrfs_truncate_item(path, item_size - sizeof(subid), 1); out: btrfs_free_path(path); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index db934ceae9c1..1c2a6e4b39da 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -27,6 +27,7 @@ #include "math.h" #include "dev-replace.h" #include "sysfs.h" +#include "tree-checker.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -184,8 +185,7 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) out_overflow:; } -static int init_first_rw_device(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); @@ -318,7 +318,6 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, mutex_init(&fs_devs->device_list_mutex); INIT_LIST_HEAD(&fs_devs->devices); - INIT_LIST_HEAD(&fs_devs->resized_devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); if (fsid) @@ -334,7 +333,9 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, void btrfs_free_device(struct btrfs_device *device) { + WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); + extent_io_tree_release(&device->alloc_state); bio_put(device->flush_bio); kfree(device); } @@ -402,7 +403,7 @@ static struct btrfs_device *__alloc_device(void) INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); - INIT_LIST_HEAD(&dev->resized_list); + INIT_LIST_HEAD(&dev->post_commit_list); spin_lock_init(&dev->io_lock); @@ -411,6 +412,7 @@ static struct btrfs_device *__alloc_device(void) btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); return dev; } @@ -1230,14 +1232,6 @@ again: mutex_unlock(&uuid_mutex); } -static void free_device_rcu(struct rcu_head *head) -{ - struct btrfs_device *device; - - device = container_of(head, struct btrfs_device, rcu); - btrfs_free_device(device); -} - static void btrfs_close_bdev(struct btrfs_device *device) { if (!device->bdev) @@ -1285,7 +1279,8 @@ static void btrfs_close_one_device(struct btrfs_device *device) list_replace_rcu(&device->dev_list, &new_device->dev_list); new_device->fs_devices = device->fs_devices; - call_rcu(&device->rcu, free_device_rcu); + synchronize_rcu(); + btrfs_free_device(device); } static int close_fs_devices(struct btrfs_fs_devices *fs_devices) @@ -1505,58 +1500,29 @@ error_bdev_put: return device; } -static int contains_pending_extent(struct btrfs_transaction *transaction, - struct btrfs_device *device, - u64 *start, u64 len) +/* + * Try to find a chunk that intersects [start, start + len] range and when one + * such is found, record the end of it in *start + */ +static bool contains_pending_extent(struct btrfs_device *device, u64 *start, + u64 len) { - struct btrfs_fs_info *fs_info = device->fs_info; - struct extent_map *em; - struct list_head *search_list = &fs_info->pinned_chunks; - int ret = 0; - u64 physical_start = *start; + u64 physical_start, physical_end; - if (transaction) - search_list = &transaction->pending_chunks; -again: - list_for_each_entry(em, search_list, list) { - struct map_lookup *map; - int i; + lockdep_assert_held(&device->fs_info->chunk_mutex); - map = em->map_lookup; - for (i = 0; i < map->num_stripes; i++) { - u64 end; + if (!find_first_extent_bit(&device->alloc_state, *start, + &physical_start, &physical_end, + CHUNK_ALLOCATED, NULL)) { - if (map->stripes[i].dev != device) - continue; - if (map->stripes[i].physical >= physical_start + len || - map->stripes[i].physical + em->orig_block_len <= - physical_start) - continue; - /* - * Make sure that while processing the pinned list we do - * not override our *start with a lower value, because - * we can have pinned chunks that fall within this - * device hole and that have lower physical addresses - * than the pending chunks we processed before. If we - * do not take this special care we can end up getting - * 2 pending chunks that start at the same physical - * device offsets because the end offset of a pinned - * chunk can be equal to the start offset of some - * pending chunk. - */ - end = map->stripes[i].physical + em->orig_block_len; - if (end > *start) { - *start = end; - ret = 1; - } + if (in_range(physical_start, *start, len) || + in_range(*start, physical_start, + physical_end - physical_start)) { + *start = physical_end + 1; + return true; } } - if (search_list != &fs_info->pinned_chunks) { - search_list = &fs_info->pinned_chunks; - goto again; - } - - return ret; + return false; } @@ -1581,8 +1547,7 @@ again: * But if we don't find suitable free space, it is used to store the size of * the max free space. */ -int find_free_dev_extent_start(struct btrfs_transaction *transaction, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes, u64 search_start, u64 *start, u64 *len) { struct btrfs_fs_info *fs_info = device->fs_info; @@ -1667,15 +1632,12 @@ again: * Have to check before we set max_hole_start, otherwise * we could end up sending back this offset anyway. */ - if (contains_pending_extent(transaction, device, - &search_start, + if (contains_pending_extent(device, &search_start, hole_size)) { - if (key.offset >= search_start) { + if (key.offset >= search_start) hole_size = key.offset - search_start; - } else { - WARN_ON_ONCE(1); + else hole_size = 0; - } } if (hole_size > max_hole_size) { @@ -1716,8 +1678,7 @@ next: if (search_end > search_start) { hole_size = search_end - search_start; - if (contains_pending_extent(transaction, device, &search_start, - hole_size)) { + if (contains_pending_extent(device, &search_start, hole_size)) { btrfs_release_path(path); goto again; } @@ -1742,13 +1703,11 @@ out: return ret; } -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { /* FIXME use last free of some kind */ - return find_free_dev_extent_start(trans->transaction, device, - num_bytes, 0, start, len); + return find_free_dev_extent_start(device, num_bytes, 0, start, len); } static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, @@ -1982,10 +1941,9 @@ static void update_dev_time(const char *path_name) filp_close(filp, NULL); } -static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, - struct btrfs_device *device) +static int btrfs_rm_dev_item(struct btrfs_device *device) { - struct btrfs_root *root = fs_info->chunk_root; + struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; @@ -2186,12 +2144,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, * counter although write_all_supers() is not locked out. This * could give a filesystem state which requires a degraded mount. */ - ret = btrfs_rm_dev_item(fs_info, device); + ret = btrfs_rm_dev_item(device); if (ret) goto error_undo; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - btrfs_scrub_cancel_dev(fs_info, device); + btrfs_scrub_cancel_dev(device); /* * the device list mutex makes sure that we don't change @@ -2242,7 +2200,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, btrfs_scratch_superblocks(device->bdev, device->name->str); btrfs_close_bdev(device); - call_rcu(&device->rcu, free_device_rcu); + synchronize_rcu(); + btrfs_free_device(device); if (cur_devices->open_devices == 0) { while (fs_devices) { @@ -2299,9 +2258,9 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) fs_devices->open_devices--; } -void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev) +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) { + struct btrfs_fs_info *fs_info = srcdev->fs_info; struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { @@ -2310,7 +2269,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, } btrfs_close_bdev(srcdev); - call_rcu(&srcdev->rcu, free_device_rcu); + synchronize_rcu(); + btrfs_free_device(srcdev); /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { @@ -2368,7 +2328,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); btrfs_close_bdev(tgtdev); - call_rcu(&tgtdev->rcu, free_device_rcu); + synchronize_rcu(); + btrfs_free_device(tgtdev); } static struct btrfs_device *btrfs_find_device_by_path( @@ -2503,9 +2464,9 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) /* * Store the expected generation for seed devices in device items. */ -static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -2705,7 +2666,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_lock(&fs_info->chunk_mutex); - ret = init_first_rw_device(trans, fs_info); + ret = init_first_rw_device(trans); mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2722,7 +2683,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; - ret = btrfs_finish_sprout(trans, fs_info); + ret = btrfs_finish_sprout(trans); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; @@ -2852,7 +2813,6 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_super_block *super_copy = fs_info->super_copy; - struct btrfs_fs_devices *fs_devices; u64 old_total; u64 diff; @@ -2871,8 +2831,6 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, return -EINVAL; } - fs_devices = fs_info->fs_devices; - btrfs_set_super_total_bytes(super_copy, round_down(old_total + diff, fs_info->sectorsize)); device->fs_devices->total_rw_bytes += diff; @@ -2880,9 +2838,9 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, btrfs_device_set_total_bytes(device, new_size); btrfs_device_set_disk_total_bytes(device, new_size); btrfs_clear_space_info_full(device->fs_info); - if (list_empty(&device->resized_list)) - list_add_tail(&device->resized_list, - &fs_devices->resized_devices); + if (list_empty(&device->post_commit_list)) + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); mutex_unlock(&fs_info->chunk_mutex); return btrfs_update_device(trans, device); @@ -3601,10 +3559,10 @@ static int chunk_soft_convert_filter(u64 chunk_type, return 0; } -static int should_balance_chunk(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, +static int should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_balance_args *bargs = NULL; u64 chunk_type = btrfs_chunk_type(leaf, chunk); @@ -3784,8 +3742,7 @@ again: spin_unlock(&fs_info->balance_lock); } - ret = should_balance_chunk(fs_info, leaf, chunk, - found_key.offset); + ret = should_balance_chunk(leaf, chunk, found_key.offset); btrfs_release_path(path); if (!ret) { @@ -4661,8 +4618,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) if (IS_ERR(trans)) return PTR_ERR(trans); - uuid_root = btrfs_create_tree(trans, fs_info, - BTRFS_UUID_TREE_OBJECTID); + uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { ret = PTR_ERR(uuid_root); btrfs_abort_transaction(trans, ret); @@ -4722,15 +4678,16 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) int slot; int failed = 0; bool retried = false; - bool checked_pending_chunks = false; struct extent_buffer *l; struct btrfs_key key; struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = btrfs_device_get_total_bytes(device); u64 diff; + u64 start; new_size = round_down(new_size, fs_info->sectorsize); + start = new_size; diff = round_down(old_size - new_size, fs_info->sectorsize); if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) @@ -4742,6 +4699,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) path->reada = READA_BACK; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, new_size); @@ -4749,7 +4712,21 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) device->fs_devices->total_rw_bytes -= diff; atomic64_sub(diff, &fs_info->free_chunk_space); } - mutex_unlock(&fs_info->chunk_mutex); + + /* + * Once the device's size has been set to the new size, ensure all + * in-memory chunks are synced to disk so that the loop below sees them + * and relocates them accordingly. + */ + if (contains_pending_extent(device, &start, diff)) { + mutex_unlock(&fs_info->chunk_mutex); + ret = btrfs_commit_transaction(trans); + if (ret) + goto done; + } else { + mutex_unlock(&fs_info->chunk_mutex); + btrfs_end_transaction(trans); + } again: key.objectid = device->devid; @@ -4840,40 +4817,10 @@ again: } mutex_lock(&fs_info->chunk_mutex); - - /* - * We checked in the above loop all device extents that were already in - * the device tree. However before we have updated the device's - * total_bytes to the new size, we might have had chunk allocations that - * have not complete yet (new block groups attached to transaction - * handles), and therefore their device extents were not yet in the - * device tree and we missed them in the loop above. So if we have any - * pending chunk using a device extent that overlaps the device range - * that we can not use anymore, commit the current transaction and - * repeat the search on the device tree - this way we guarantee we will - * not have chunks using device extents that end beyond 'new_size'. - */ - if (!checked_pending_chunks) { - u64 start = new_size; - u64 len = old_size - new_size; - - if (contains_pending_extent(trans->transaction, device, - &start, len)) { - mutex_unlock(&fs_info->chunk_mutex); - checked_pending_chunks = true; - failed = 0; - retried = false; - ret = btrfs_commit_transaction(trans); - if (ret) - goto done; - goto again; - } - } - btrfs_device_set_disk_total_bytes(device, new_size); - if (list_empty(&device->resized_list)) - list_add_tail(&device->resized_list, - &fs_info->fs_devices->resized_devices); + if (list_empty(&device->post_commit_list)) + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); WARN_ON(diff > old_total); btrfs_set_super_total_bytes(super_copy, @@ -4957,15 +4904,6 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } -#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ - - sizeof(struct btrfs_chunk)) \ - / sizeof(struct btrfs_stripe) + 1) - -#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ - - 2 * sizeof(struct btrfs_disk_key) \ - - 2 * sizeof(struct btrfs_chunk)) \ - / sizeof(struct btrfs_stripe) + 1) - static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 start, u64 type) { @@ -5038,7 +4976,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } else { btrfs_err(info, "invalid chunk type 0x%llx requested", type); - BUG_ON(1); + BUG(); } /* We don't want a chunk larger than 10% of writable space */ @@ -5079,7 +5017,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(trans, device, + ret = find_free_dev_extent(device, max_stripe_size * dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) @@ -5213,18 +5151,20 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, free_extent_map(em); goto error; } - - list_add_tail(&em->list, &trans->transaction->pending_chunks); - refcount_inc(&em->refs); write_unlock(&em_tree->lock); ret = btrfs_make_block_group(trans, 0, type, start, chunk_size); if (ret) goto error_del_extent; - for (i = 0; i < map->num_stripes; i++) - btrfs_device_set_bytes_used(map->stripes[i].dev, - map->stripes[i].dev->bytes_used + stripe_size); + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *dev = map->stripes[i].dev; + + btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size); + if (list_empty(&dev->post_commit_list)) + list_add_tail(&dev->post_commit_list, + &trans->transaction->dev_update_list); + } atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); @@ -5243,8 +5183,6 @@ error_del_extent: free_extent_map(em); /* One for the tree reference */ free_extent_map(em); - /* One for the pending_chunks list reference */ - free_extent_map(em); error: kfree(devices_info); return ret; @@ -5364,9 +5302,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) return __btrfs_alloc_chunk(trans, chunk_offset, type); } -static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; u64 chunk_offset; u64 sys_chunk_offset; u64 alloc_profile; @@ -6714,99 +6652,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, return dev; } -/* Return -EIO if any error, otherwise return 0. */ -static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 logical) -{ - u64 length; - u64 stripe_len; - u16 num_stripes; - u16 sub_stripes; - u64 type; - u64 features; - bool mixed = false; - - length = btrfs_chunk_length(leaf, chunk); - stripe_len = btrfs_chunk_stripe_len(leaf, chunk); - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); - type = btrfs_chunk_type(leaf, chunk); - - if (!num_stripes) { - btrfs_err(fs_info, "invalid chunk num_stripes: %u", - num_stripes); - return -EIO; - } - if (!IS_ALIGNED(logical, fs_info->sectorsize)) { - btrfs_err(fs_info, "invalid chunk logical %llu", logical); - return -EIO; - } - if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { - btrfs_err(fs_info, "invalid chunk sectorsize %u", - btrfs_chunk_sector_size(leaf, chunk)); - return -EIO; - } - if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { - btrfs_err(fs_info, "invalid chunk length %llu", length); - return -EIO; - } - if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { - btrfs_err(fs_info, "invalid chunk stripe length: %llu", - stripe_len); - return -EIO; - } - if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & - type) { - btrfs_err(fs_info, "unrecognized chunk type: %llu", - ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)); - return -EIO; - } - - if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { - btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type); - return -EIO; - } - - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && - (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { - btrfs_err(fs_info, - "system chunk with data or metadata type: 0x%llx", type); - return -EIO; - } - - features = btrfs_super_incompat_flags(fs_info->super_copy); - if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) - mixed = true; - - if (!mixed) { - if ((type & BTRFS_BLOCK_GROUP_METADATA) && - (type & BTRFS_BLOCK_GROUP_DATA)) { - btrfs_err(fs_info, - "mixed chunk type in non-mixed mode: 0x%llx", type); - return -EIO; - } - } - - if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || - (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || - (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || - ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && - num_stripes != 1)) { - btrfs_err(fs_info, - "invalid num_stripes:sub_stripes %u:%u for profile %llu", - num_stripes, sub_stripes, - type & BTRFS_BLOCK_GROUP_PROFILE_MASK); - return -EIO; - } - - return 0; -} - static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, bool error) { @@ -6818,10 +6663,30 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, devid, uuid); } -static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, - struct extent_buffer *leaf, +static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) +{ + int index = btrfs_bg_flags_to_raid_index(type); + int ncopies = btrfs_raid_array[index].ncopies; + int data_stripes; + + switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + case BTRFS_BLOCK_GROUP_RAID5: + data_stripes = num_stripes - 1; + break; + case BTRFS_BLOCK_GROUP_RAID6: + data_stripes = num_stripes - 2; + break; + default: + data_stripes = num_stripes / ncopies; + break; + } + return div_u64(chunk_len, data_stripes); +} + +static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; @@ -6837,9 +6702,15 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, length = btrfs_chunk_length(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); - if (ret) - return ret; + /* + * Only need to verify chunk item if we're reading from sys chunk array, + * as chunk item in tree block is already verified by tree-checker. + */ + if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { + ret = btrfs_check_chunk_valid(leaf, chunk, logical); + if (ret) + return ret; + } read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6877,6 +6748,8 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, map->type = btrfs_chunk_type(leaf, chunk); map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); map->verified_stripes = 0; + em->orig_block_len = calc_stripe_length(map->type, em->len, + map->num_stripes); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -7001,10 +6874,10 @@ out: return fs_devices; } -static int read_one_dev(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, +static int read_one_dev(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 devid; @@ -7193,7 +7066,7 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) if (cur_offset + len > array_size) goto out_short_read; - ret = read_one_chunk(fs_info, &key, sb, chunk); + ret = read_one_chunk(&key, sb, chunk); if (ret) break; } else { @@ -7334,14 +7207,14 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) struct btrfs_dev_item *dev_item; dev_item = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); - ret = read_one_dev(fs_info, leaf, dev_item); + ret = read_one_dev(leaf, dev_item); if (ret) goto error; total_dev++; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); - ret = read_one_chunk(fs_info, &found_key, leaf, chunk); + ret = read_one_chunk(&found_key, leaf, chunk); if (ret) goto error; } @@ -7530,9 +7403,9 @@ out: /* * called from commit_transaction. Writes all changed device stats to disk. */ -int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; int stats_cnt; @@ -7674,51 +7547,34 @@ void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_pat } /* - * Update the size of all devices, which is used for writing out the - * super blocks. + * Update the size and bytes used for each device where it changed. This is + * delayed since we would otherwise get errors while writing out the + * superblocks. + * + * Must be invoked during transaction commit. */ -void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) +void btrfs_commit_device_sizes(struct btrfs_transaction *trans) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *curr, *next; - if (list_empty(&fs_devices->resized_devices)) - return; - - mutex_lock(&fs_devices->device_list_mutex); - mutex_lock(&fs_info->chunk_mutex); - list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, - resized_list) { - list_del_init(&curr->resized_list); - curr->commit_total_bytes = curr->disk_total_bytes; - } - mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_devices->device_list_mutex); -} + ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); -/* Must be invoked during the transaction commit */ -void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct extent_map *em; - struct map_lookup *map; - struct btrfs_device *dev; - int i; - - if (list_empty(&trans->pending_chunks)) + if (list_empty(&trans->dev_update_list)) return; - /* In order to kick the device replace finish process */ - mutex_lock(&fs_info->chunk_mutex); - list_for_each_entry(em, &trans->pending_chunks, list) { - map = em->map_lookup; - - for (i = 0; i < map->num_stripes; i++) { - dev = map->stripes[i].dev; - dev->commit_bytes_used = dev->bytes_used; - } + /* + * We don't need the device_list_mutex here. This list is owned by the + * transaction and the transaction must complete before the device is + * released. + */ + mutex_lock(&trans->fs_info->chunk_mutex); + list_for_each_entry_safe(curr, next, &trans->dev_update_list, + post_commit_list) { + list_del_init(&curr->post_commit_list); + curr->commit_total_bytes = curr->disk_total_bytes; + curr->commit_bytes_used = curr->bytes_used; } - mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&trans->fs_info->chunk_mutex); } void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) @@ -7751,25 +7607,6 @@ int btrfs_bg_type_to_factor(u64 flags) } -static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) -{ - int index = btrfs_bg_flags_to_raid_index(type); - int ncopies = btrfs_raid_array[index].ncopies; - int data_stripes; - - switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - case BTRFS_BLOCK_GROUP_RAID5: - data_stripes = num_stripes - 1; - break; - case BTRFS_BLOCK_GROUP_RAID6: - data_stripes = num_stripes - 2; - break; - default: - data_stripes = num_stripes / ncopies; - break; - } - return div_u64(chunk_len, data_stripes); -} static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3ad9d58d1b66..136a3eb64604 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -45,6 +45,7 @@ struct btrfs_pending_bios { struct btrfs_device { struct list_head dev_list; struct list_head dev_alloc_list; + struct list_head post_commit_list; /* chunk mutex */ struct btrfs_fs_devices *fs_devices; struct btrfs_fs_info *fs_info; @@ -102,18 +103,12 @@ struct btrfs_device { * size of the device on the current transaction * * This variant is update when committing the transaction, - * and protected by device_list_mutex + * and protected by chunk mutex */ u64 commit_total_bytes; /* bytes used on the current transaction */ u64 commit_bytes_used; - /* - * used to manage the device which is resized - * - * It is protected by chunk_lock. - */ - struct list_head resized_list; /* for sending down flush barriers */ struct bio *flush_bio; @@ -123,7 +118,6 @@ struct btrfs_device { struct scrub_ctx *scrub_ctx; struct btrfs_work work; - struct rcu_head rcu; /* readahead state */ atomic_t reada_in_flight; @@ -139,6 +133,8 @@ struct btrfs_device { /* Counter to record the change of device stats */ atomic_t dev_stats_ccnt; atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; + + struct extent_io_tree alloc_state; }; /* @@ -235,7 +231,6 @@ struct btrfs_fs_devices { struct mutex device_list_mutex; struct list_head devices; - struct list_head resized_devices; /* devices not currently being allocated */ struct list_head alloc_list; @@ -258,6 +253,15 @@ struct btrfs_fs_devices { #define BTRFS_BIO_INLINE_CSUM_SIZE 64 +#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ + - sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + +#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ + - 2 * sizeof(struct btrfs_disk_key) \ + - 2 * sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + /* * we need the mirror number and stripe index to be passed around * the call chain while we are processing end_io (especially errors). @@ -390,6 +394,7 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) return BTRFS_MAP_WRITE; default: WARN_ON_ONCE(1); + /* fall through */ case REQ_OP_READ: return BTRFS_MAP_READ; } @@ -449,22 +454,18 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset); -int find_free_dev_extent_start(struct btrfs_transaction *transaction, - struct btrfs_device *device, u64 num_bytes, - u64 search_start, u64 *start, u64 *max_avail); -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes, + u64 search_start, u64 *start, u64 *max_avail); +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats); void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); -int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); -void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev); +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, @@ -558,8 +559,7 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) const char *get_raid_name(enum btrfs_raid_types type); -void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); -void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans); +void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index f141b45ce349..78b6ba2029e8 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -76,9 +76,8 @@ out: return ret; } -static int do_setxattr(struct btrfs_trans_handle *trans, - struct inode *inode, const char *name, - const void *value, size_t size, int flags) +int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -87,6 +86,8 @@ static int do_setxattr(struct btrfs_trans_handle *trans, size_t name_len = strlen(name); int ret = 0; + ASSERT(trans); + if (name_len + size > BTRFS_MAX_XATTR_SIZE(root->fs_info)) return -ENOSPC; @@ -174,7 +175,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, char *ptr; if (size > old_data_len) { - if (btrfs_leaf_free_space(fs_info, leaf) < + if (btrfs_leaf_free_space(leaf) < (size - old_data_len)) { ret = -ENOSPC; goto out; @@ -184,17 +185,15 @@ static int do_setxattr(struct btrfs_trans_handle *trans, if (old_data_len + name_len + sizeof(*di) == item_size) { /* No other xattrs packed in the same leaf item. */ if (size > old_data_len) - btrfs_extend_item(fs_info, path, - size - old_data_len); + btrfs_extend_item(path, size - old_data_len); else if (size < old_data_len) - btrfs_truncate_item(fs_info, path, - data_size, 1); + btrfs_truncate_item(path, data_size, 1); } else { /* There are other xattrs packed in the same item. */ ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto out; - btrfs_extend_item(fs_info, path, data_size); + btrfs_extend_item(path, data_size); } item = btrfs_item_nr(slot); @@ -220,24 +219,18 @@ out: /* * @value: "" makes the attribute to empty, NULL removes it */ -int btrfs_setxattr(struct btrfs_trans_handle *trans, - struct inode *inode, const char *name, - const void *value, size_t size, int flags) +int btrfs_setxattr_trans(struct inode *inode, const char *name, + const void *value, size_t size, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; int ret; - if (btrfs_root_readonly(root)) - return -EROFS; - - if (trans) - return do_setxattr(trans, inode, name, value, size, flags); - trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = do_setxattr(trans, inode, name, value, size, flags); + ret = btrfs_setxattr(trans, inode, name, value, size, flags); if (ret) goto out; @@ -370,7 +363,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, size_t size, int flags) { name = xattr_full_name(handler, name); - return btrfs_setxattr(NULL, inode, name, buffer, size, flags); + return btrfs_setxattr_trans(inode, name, buffer, size, flags); } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, @@ -378,8 +371,32 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { + int ret; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + name = xattr_full_name(handler, name); - return btrfs_set_prop(inode, name, value, size, flags); + ret = btrfs_validate_prop(name, value, size); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_set_prop(trans, inode, name, value, size, flags); + if (!ret) { + inode_inc_iversion(inode); + inode->i_ctime = current_time(inode); + set_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + } + + btrfs_end_transaction(trans); + + return ret; } static const struct xattr_handler btrfs_security_xattr_handler = { @@ -419,10 +436,10 @@ const struct xattr_handler *btrfs_xattr_handlers[] = { }; static int btrfs_initxattrs(struct inode *inode, - const struct xattr *xattr_array, void *fs_info) + const struct xattr *xattr_array, void *fs_private) { + struct btrfs_trans_handle *trans = fs_private; const struct xattr *xattr; - struct btrfs_trans_handle *trans = fs_info; unsigned int nofs_flag; char *name; int err = 0; @@ -442,7 +459,7 @@ static int btrfs_initxattrs(struct inode *inode, strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); err = btrfs_setxattr(trans, inode, name, xattr->value, - xattr->value_len, 0); + xattr->value_len, 0); kfree(name); if (err < 0) break; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 471fcac6ff55..1cd3fc0a8f17 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -12,9 +12,10 @@ extern const struct xattr_handler *btrfs_xattr_handlers[]; int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); -int btrfs_setxattr(struct btrfs_trans_handle *trans, - struct inode *inode, const char *name, - const void *value, size_t size, int flags); +int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, + const char *name, const void *value, size_t size, int flags); +int btrfs_setxattr_trans(struct inode *inode, const char *name, + const void *value, size_t size, int flags); ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 6b9e29d050f3..a6ff07cf11d5 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -90,6 +90,9 @@ static inline struct workspace *list_to_workspace(struct list_head *list) return container_of(list, struct workspace, list); } +static void zstd_free_workspace(struct list_head *ws); +static struct list_head *zstd_alloc_workspace(unsigned int level); + /* * zstd_reclaim_timer_fn - reclaim timer * @t: timer @@ -124,7 +127,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) level = victim->level; list_del(&victim->lru_list); list_del(&victim->list); - wsm.ops->free_workspace(&victim->list); + zstd_free_workspace(&victim->list); if (list_empty(&wsm.idle_ws[level - 1])) clear_bit(level - 1, &wsm.active_map); @@ -180,7 +183,7 @@ static void zstd_init_workspace_manager(void) for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&wsm.idle_ws[i]); - ws = wsm.ops->alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); + ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); if (IS_ERR(ws)) { pr_warn( "BTRFS: cannot preallocate zstd compression workspace\n"); @@ -202,7 +205,7 @@ static void zstd_cleanup_workspace_manager(void) struct workspace, list); list_del(&workspace->list); list_del(&workspace->lru_list); - wsm.ops->free_workspace(&workspace->list); + zstd_free_workspace(&workspace->list); } } spin_unlock(&wsm.lock); @@ -272,7 +275,7 @@ again: return ws; nofs_flag = memalloc_nofs_save(); - ws = wsm.ops->alloc_workspace(level); + ws = zstd_alloc_workspace(level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ws)) { diff --git a/fs/buffer.c b/fs/buffer.c index ce357602f471..0faa41fb4c88 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2085,7 +2085,7 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, } EXPORT_SYMBOL(block_write_begin); -int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied, +void __generic_write_end(struct inode *inode, loff_t pos, unsigned copied, struct page *page) { loff_t old_size = inode->i_size; @@ -2104,7 +2104,6 @@ int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied, } unlock_page(page); - put_page(page); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); @@ -2116,7 +2115,6 @@ int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied, */ if (i_size_changed) mark_inode_dirty(inode); - return copied; } int block_write_end(struct file *file, struct address_space *mapping, @@ -2160,7 +2158,9 @@ int generic_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - return __generic_write_end(mapping->host, pos, copied, page); + __generic_write_end(mapping->host, pos, copied, page); + put_page(page); + return copied; } EXPORT_SYMBOL(generic_write_end); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a8f429882249..0637149fb9f9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1766,6 +1766,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) { struct ceph_inode_info *dci = ceph_inode(dir); + unsigned hash; switch (dci->i_dir_layout.dl_dir_hash) { case 0: /* for backward compat */ @@ -1773,8 +1774,11 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) return dn->d_name.hash; default: - return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + spin_lock(&dn->d_lock); + hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash, dn->d_name.name, dn->d_name.len); + spin_unlock(&dn->d_lock); + return hash; } } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9f53c3d99304..84725b53ac21 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -247,6 +247,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) case S_IFREG: ceph_fscache_register_inode_cookie(inode); ceph_fscache_file_set_cookie(inode, file); + /* fall through */ case S_IFDIR: ret = ceph_init_file_info(inode, file, fmode, S_ISDIR(inode->i_mode)); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2d61ddda9bf5..35dae6d5493a 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -519,9 +519,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) return &ci->vfs_inode; } -static void ceph_i_callback(struct rcu_head *head) +void ceph_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); struct ceph_inode_info *ci = ceph_inode(inode); kfree(ci->i_symlink); @@ -581,8 +580,6 @@ void ceph_destroy_inode(struct inode *inode) ceph_buffer_put(ci->i_xattrs.prealloc_blob); ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); - - call_rcu(&inode->i_rcu, ceph_i_callback); } int ceph_drop_inode(struct inode *inode) @@ -1163,6 +1160,19 @@ static int splice_dentry(struct dentry **pdn, struct inode *in) return 0; } +static int d_name_cmp(struct dentry *dentry, const char *name, size_t len) +{ + int ret; + + /* take d_lock to ensure dentry->d_name stability */ + spin_lock(&dentry->d_lock); + ret = dentry->d_name.len - len; + if (!ret) + ret = memcmp(dentry->d_name.name, name, len); + spin_unlock(&dentry->d_lock); + return ret; +} + /* * Incorporate results into the local cache. This is either just * one inode, or a directory, dentry, and possibly linked-to inode (e.g., @@ -1412,7 +1422,8 @@ retry_lookup: err = splice_dentry(&req->r_dentry, in); if (err < 0) goto done; - } else if (rinfo->head->is_dentry) { + } else if (rinfo->head->is_dentry && + !d_name_cmp(req->r_dentry, rinfo->dname, rinfo->dname_len)) { struct ceph_vino *ptvino = NULL; if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) || diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 21c33ed048ed..9049c2a3e972 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1414,6 +1414,15 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } + + if (drop && + ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } } spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { @@ -2161,10 +2170,39 @@ retry: return path; } +/* Duplicate the dentry->d_name.name safely */ +static int clone_dentry_name(struct dentry *dentry, const char **ppath, + int *ppathlen) +{ + u32 len; + char *name; + +retry: + len = READ_ONCE(dentry->d_name.len); + name = kmalloc(len + 1, GFP_NOFS); + if (!name) + return -ENOMEM; + + spin_lock(&dentry->d_lock); + if (dentry->d_name.len != len) { + spin_unlock(&dentry->d_lock); + kfree(name); + goto retry; + } + memcpy(name, dentry->d_name.name, len); + spin_unlock(&dentry->d_lock); + + name[len] = '\0'; + *ppath = name; + *ppathlen = len; + return 0; +} + static int build_dentry_path(struct dentry *dentry, struct inode *dir, const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) + bool *pfreepath, bool parent_locked) { + int ret; char *path; rcu_read_lock(); @@ -2173,8 +2211,15 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, if (dir && ceph_snap(dir) == CEPH_NOSNAP) { *pino = ceph_ino(dir); rcu_read_unlock(); - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; + if (parent_locked) { + *ppath = dentry->d_name.name; + *ppathlen = dentry->d_name.len; + } else { + ret = clone_dentry_name(dentry, ppath, ppathlen); + if (ret) + return ret; + *pfreepath = true; + } return 0; } rcu_read_unlock(); @@ -2182,13 +2227,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; - *pfreepath = 1; + *pfreepath = true; return 0; } static int build_inode_path(struct inode *inode, const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) + bool *pfreepath) { struct dentry *dentry; char *path; @@ -2204,7 +2249,7 @@ static int build_inode_path(struct inode *inode, if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; - *pfreepath = 1; + *pfreepath = true; return 0; } @@ -2215,7 +2260,7 @@ static int build_inode_path(struct inode *inode, static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, struct inode *rdiri, const char *rpath, u64 rino, const char **ppath, int *pathlen, - u64 *ino, int *freepath) + u64 *ino, bool *freepath, bool parent_locked) { int r = 0; @@ -2225,7 +2270,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, ceph_snap(rinode)); } else if (rdentry) { r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, - freepath); + freepath, parent_locked); dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); } else if (rpath || rino) { @@ -2251,7 +2296,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; - int freepath1 = 0, freepath2 = 0; + bool freepath1 = false, freepath2 = false; int len; u16 releases; void *p, *end; @@ -2259,16 +2304,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ret = set_request_path_attr(req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, - &path1, &pathlen1, &ino1, &freepath1); + &path1, &pathlen1, &ino1, &freepath1, + test_bit(CEPH_MDS_R_PARENT_LOCKED, + &req->r_req_flags)); if (ret < 0) { msg = ERR_PTR(ret); goto out; } + /* If r_old_dentry is set, then assume that its parent is locked */ ret = set_request_path_attr(NULL, req->r_old_dentry, req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, - &path2, &pathlen2, &ino2, &freepath2); + &path2, &pathlen2, &ino2, &freepath2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 89aa37fa0f84..b26e12cd8ec3 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -572,7 +572,12 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) old_snapc = NULL; update_snapc: - if (ci->i_head_snapc) { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ci->i_head_snapc = NULL; + } else { ci->i_head_snapc = ceph_get_snap_context(new_snapc); dout(" new snapc is %p\n", new_snapc); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6d5bb2f74612..285edda4fc3b 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -848,6 +848,7 @@ static void ceph_umount_begin(struct super_block *sb) static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .destroy_inode = ceph_destroy_inode, + .free_inode = ceph_free_inode, .write_inode = ceph_write_inode, .drop_inode = ceph_drop_inode, .sync_fs = ceph_sync_fs, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 16c03188578e..c5b4a05905c0 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -874,6 +874,7 @@ extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_destroy_inode(struct inode *inode); +extern void ceph_free_inode(struct inode *inode); extern int ceph_drop_inode(struct inode *inode); extern struct inode *ceph_get_inode(struct super_block *sb, diff --git a/fs/char_dev.c b/fs/char_dev.c index a279c58fe360..d18cad28c1c3 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -88,22 +88,31 @@ static int find_dynamic_major(void) /* * Register a single major with a specified minor range. * - * If major == 0 this functions will dynamically allocate a major and return - * its number. - * - * If major > 0 this function will attempt to reserve the passed range of - * minors and will return zero on success. + * If major == 0 this function will dynamically allocate an unused major. + * If major > 0 this function will attempt to reserve the range of minors + * with given major. * - * Returns a -ve errno on failure. */ static struct char_device_struct * __register_chrdev_region(unsigned int major, unsigned int baseminor, int minorct, const char *name) { - struct char_device_struct *cd, **cp; - int ret = 0; + struct char_device_struct *cd, *curr, *prev = NULL; + int ret = -EBUSY; int i; + if (major >= CHRDEV_MAJOR_MAX) { + pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n", + name, major, CHRDEV_MAJOR_MAX-1); + return ERR_PTR(-EINVAL); + } + + if (minorct > MINORMASK + 1 - baseminor) { + pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n", + name, baseminor, baseminor + minorct - 1, 0, MINORMASK); + return ERR_PTR(-EINVAL); + } + cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); @@ -120,10 +129,20 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, major = ret; } - if (major >= CHRDEV_MAJOR_MAX) { - pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n", - name, major, CHRDEV_MAJOR_MAX-1); - ret = -EINVAL; + i = major_to_index(major); + for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) { + if (curr->major < major) + continue; + + if (curr->major > major) + break; + + if (curr->baseminor + curr->minorct <= baseminor) + continue; + + if (curr->baseminor >= baseminor + minorct) + break; + goto out; } @@ -132,37 +151,14 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, cd->minorct = minorct; strlcpy(cd->name, name, sizeof(cd->name)); - i = major_to_index(major); - - for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) - if ((*cp)->major > major || - ((*cp)->major == major && - (((*cp)->baseminor >= baseminor) || - ((*cp)->baseminor + (*cp)->minorct > baseminor)))) - break; - - /* Check for overlapping minor ranges. */ - if (*cp && (*cp)->major == major) { - int old_min = (*cp)->baseminor; - int old_max = (*cp)->baseminor + (*cp)->minorct - 1; - int new_min = baseminor; - int new_max = baseminor + minorct - 1; - - /* New driver overlaps from the left. */ - if (new_max >= old_min && new_max <= old_max) { - ret = -EBUSY; - goto out; - } - - /* New driver overlaps from the right. */ - if (new_min <= old_max && new_min >= old_min) { - ret = -EBUSY; - goto out; - } + if (!prev) { + cd->next = curr; + chrdevs[i] = cd; + } else { + cd->next = prev->next; + prev->next = cd; } - cd->next = *cp; - *cp = cd; mutex_unlock(&chrdevs_lock); return cd; out: diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f9b71c12cc9f..877174761efb 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -315,16 +315,10 @@ cifs_alloc_inode(struct super_block *sb) return &cifs_inode->vfs_inode; } -static void cifs_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); -} - static void -cifs_destroy_inode(struct inode *inode) +cifs_free_inode(struct inode *inode) { - call_rcu(&inode->i_rcu, cifs_i_callback); + kmem_cache_free(cifs_inode_cachep, CIFS_I(inode)); } static void @@ -559,6 +553,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) tcon->ses->server->echo_interval / HZ); if (tcon->snapshot_time) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); + if (tcon->handle_timeout) + seq_printf(s, ",handletimeout=%u", tcon->handle_timeout); /* convert actimeo and display it in seconds */ seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); @@ -628,7 +624,7 @@ static int cifs_drop_inode(struct inode *inode) static const struct super_operations cifs_super_ops = { .statfs = cifs_statfs, .alloc_inode = cifs_alloc_inode, - .destroy_inode = cifs_destroy_inode, + .free_inode = cifs_free_inode, .drop_inode = cifs_drop_inode, .evict_inode = cifs_evict_inode, /* .delete_inode = cifs_delete_inode, */ /* Do not need above diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 38feae812b47..585ad3207cb1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -60,6 +60,12 @@ #define CIFS_MAX_ACTIMEO (1 << 30) /* + * Max persistent and resilient handle timeout (milliseconds). + * Windows durable max was 960000 (16 minutes) + */ +#define SMB3_MAX_HANDLE_TIMEOUT 960000 + +/* * MAX_REQ is the maximum number of requests that WE will send * on one socket concurrently. */ @@ -586,6 +592,7 @@ struct smb_vol { struct nls_table *local_nls; unsigned int echo_interval; /* echo interval in secs */ __u64 snapshot_time; /* needed for timewarp tokens */ + __u32 handle_timeout; /* persistent and durable handle timeout in ms */ unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ }; @@ -1058,6 +1065,7 @@ struct cifs_tcon { __u32 vol_serial_number; __le64 vol_create_time; __u64 snapshot_time; /* for timewarp tokens - timestamp of snapshot */ + __u32 handle_timeout; /* persistent and durable handle timeout in ms */ __u32 ss_flags; /* sector size flags */ __u32 perf_sector_size; /* best sector size for perf */ __u32 max_chunks; @@ -1325,6 +1333,7 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) } struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr); void cifsFileInfo_put(struct cifsFileInfo *cifs_file); #define CIFS_CACHE_READ_FLG 1 @@ -1847,6 +1856,7 @@ GLOBAL_EXTERN spinlock_t gidsidlock; #endif /* CONFIG_CIFS_ACL */ void cifs_oplock_break(struct work_struct *work); +void cifs_queue_oplock_break(struct cifsFileInfo *cfile); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a8e9738db691..4c0e44489f21 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -103,7 +103,7 @@ enum { Opt_cruid, Opt_gid, Opt_file_mode, Opt_dirmode, Opt_port, Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo, - Opt_echo_interval, Opt_max_credits, + Opt_echo_interval, Opt_max_credits, Opt_handletimeout, Opt_snapshot, /* Mount options which take string value */ @@ -208,6 +208,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_rsize, "rsize=%s" }, { Opt_wsize, "wsize=%s" }, { Opt_actimeo, "actimeo=%s" }, + { Opt_handletimeout, "handletimeout=%s" }, { Opt_echo_interval, "echo_interval=%s" }, { Opt_max_credits, "max_credits=%s" }, { Opt_snapshot, "snapshot=%s" }, @@ -1619,6 +1620,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->actimeo = CIFS_DEF_ACTIMEO; + /* Most clients set timeout to 0, allows server to use its default */ + vol->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */ + /* offer SMB2.1 and later (SMB3 etc). Secure and widely accepted */ vol->ops = &smb30_operations; vol->vals = &smbdefault_values; @@ -2017,6 +2021,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } break; + case Opt_handletimeout: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid handletimeout value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->handle_timeout = option; + if (vol->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) { + cifs_dbg(VFS, "Invalid handle cache timeout, longer than 16 minutes\n"); + goto cifs_parse_mount_err; + } + break; case Opt_echo_interval: if (get_option_ul(args, &option)) { cifs_dbg(VFS, "%s: Invalid echo interval value\n", @@ -3183,6 +3199,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb_vol *volume_info) return 0; if (tcon->snapshot_time != volume_info->snapshot_time) return 0; + if (tcon->handle_timeout != volume_info->handle_timeout) + return 0; return 1; } @@ -3297,6 +3315,16 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->snapshot_time = volume_info->snapshot_time; } + if (volume_info->handle_timeout) { + if (ses->server->vals->protocol_id == 0) { + cifs_dbg(VFS, + "Use SMB2.1 or later for handle timeout option\n"); + rc = -EOPNOTSUPP; + goto out_fail; + } else + tcon->handle_timeout = volume_info->handle_timeout; + } + tcon->ses = ses; if (volume_info->password) { tcon->password = kstrdup(volume_info->password, GFP_KERNEL); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 89006e044973..7037a137fa53 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -360,13 +360,31 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) return cifs_file; } -/* - * Release a reference on the file private data. This may involve closing - * the filehandle out on the server. Must be called without holding - * tcon->open_file_lock and cifs_file->file_info_lock. +/** + * cifsFileInfo_put - release a reference of file priv data + * + * Always potentially wait for oplock handler. See _cifsFileInfo_put(). */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { + _cifsFileInfo_put(cifs_file, true); +} + +/** + * _cifsFileInfo_put - release a reference of file priv data + * + * This may involve closing the filehandle @cifs_file out on the + * server. Must be called without holding tcon->open_file_lock and + * cifs_file->file_info_lock. + * + * If @wait_for_oplock_handler is true and we are releasing the last + * reference, wait for any running oplock break handler of the file + * and cancel any pending one. If calling this function from the + * oplock break handler, you need to pass false. + * + */ +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) +{ struct inode *inode = d_inode(cifs_file->dentry); struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); struct TCP_Server_Info *server = tcon->ses->server; @@ -414,7 +432,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) spin_unlock(&tcon->open_file_lock); - oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break); + oplock_break_cancelled = wait_oplock_handler ? + cancel_work_sync(&cifs_file->oplock_break) : false; if (!tcon->need_reconnect && !cifs_file->invalidHandle) { struct TCP_Server_Info *server = tcon->ses->server; @@ -2858,7 +2877,6 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx) struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; struct dentry *dentry = ctx->cfile->dentry; - unsigned int i; int rc; tcon = tlink_tcon(ctx->cfile->tlink); @@ -2922,10 +2940,6 @@ restart_loop: kref_put(&wdata->refcount, cifs_uncached_writedata_release); } - if (!ctx->direct_io) - for (i = 0; i < ctx->npages; i++) - put_page(ctx->bv[i].bv_page); - cifs_stats_bytes_written(tcon, ctx->total_len); set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags); @@ -3563,7 +3577,6 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx) struct iov_iter *to = &ctx->iter; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - unsigned int i; int rc; tcon = tlink_tcon(ctx->cfile->tlink); @@ -3647,15 +3660,8 @@ again: kref_put(&rdata->refcount, cifs_uncached_readdata_release); } - if (!ctx->direct_io) { - for (i = 0; i < ctx->npages; i++) { - if (ctx->should_dirty) - set_page_dirty(ctx->bv[i].bv_page); - put_page(ctx->bv[i].bv_page); - } - + if (!ctx->direct_io) ctx->total_len = ctx->len - iov_iter_count(to); - } /* mask nodata case */ if (rc == -ENODATA) @@ -4603,6 +4609,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + _cifsFileInfo_put(cfile, false /* do not wait for ourself */); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 53fdb5df0d2e..538fd7d807e4 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1735,6 +1735,10 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (rc == 0 || rc != -EBUSY) goto do_rename_exit; + /* Don't fall back to using SMB on SMB 2+ mount */ + if (server->vals->protocol_id != 0) + goto do_rename_exit; + /* open-file renames don't work across directories */ if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index bee203055b30..b1a696a73f7c 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -501,8 +501,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &pCifsInode->flags); - queue_work(cifsoplockd_wq, - &netfile->oplock_break); + cifs_queue_oplock_break(netfile); netfile->oplock_break_cancelled = false; spin_unlock(&tcon->open_file_lock); @@ -607,6 +606,28 @@ void cifs_put_writer(struct cifsInodeInfo *cinode) spin_unlock(&cinode->writers_lock); } +/** + * cifs_queue_oplock_break - queue the oplock break handler for cfile + * + * This function is called from the demultiplex thread when it + * receives an oplock break for @cfile. + * + * Assumes the tcon->open_file_lock is held. + * Assumes cfile->file_info_lock is NOT held. + */ +void cifs_queue_oplock_break(struct cifsFileInfo *cfile) +{ + /* + * Bump the handle refcount now while we hold the + * open_file_lock to enforce the validity of it for the oplock + * break handler. The matching put is done at the end of the + * handler. + */ + cifsFileInfo_get(cfile); + + queue_work(cifsoplockd_wq, &cfile->oplock_break); +} + void cifs_done_oplock_break(struct cifsInodeInfo *cinode) { clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); @@ -768,6 +789,11 @@ cifs_aio_ctx_alloc(void) { struct cifs_aio_ctx *ctx; + /* + * Must use kzalloc to initialize ctx->bv to NULL and ctx->direct_io + * to false so that we know when we have to unreference pages within + * cifs_aio_ctx_release() + */ ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL); if (!ctx) return NULL; @@ -786,7 +812,23 @@ cifs_aio_ctx_release(struct kref *refcount) struct cifs_aio_ctx, refcount); cifsFileInfo_put(ctx->cfile); - kvfree(ctx->bv); + + /* + * ctx->bv is only set if setup_aio_ctx_iter() was call successfuly + * which means that iov_iter_get_pages() was a success and thus that + * we have taken reference on pages. + */ + if (ctx->bv) { + unsigned i; + + for (i = 0; i < ctx->npages; i++) { + if (ctx->should_dirty) + set_page_dirty(ctx->bv[i].bv_page); + put_page(ctx->bv[i].bv_page); + } + kvfree(ctx->bv); + } + kfree(ctx); } @@ -917,7 +959,6 @@ cifs_alloc_hash(const char *name, } (*sdesc)->shash.tfm = *shash; - (*sdesc)->shash.flags = 0x0; return 0; } diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index b204e84b87fb..54bffb2a1786 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -68,13 +68,15 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, if (oparms->tcon->use_resilient) { - nr_ioctl_req.Timeout = 0; /* use server default (120 seconds) */ + /* default timeout is 0, servers pick default (120 seconds) */ + nr_ioctl_req.Timeout = + cpu_to_le32(oparms->tcon->handle_timeout); nr_ioctl_req.Reserved = 0; rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid, fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, true /* is_fsctl */, (char *)&nr_ioctl_req, sizeof(nr_ioctl_req), - NULL, NULL /* no return info */); + CIFSMaxBufSize, NULL, NULL /* no return info */); if (rc == -EOPNOTSUPP) { cifs_dbg(VFS, "resiliency not supported by server, disabling\n"); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 0e3570e40ff8..e311f58dc1c8 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -555,7 +555,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); - queue_work(cifsoplockd_wq, &cfile->oplock_break); + cifs_queue_oplock_break(cfile); kfree(lw); return true; } @@ -712,8 +712,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); spin_unlock(&cfile->file_info_lock); - queue_work(cifsoplockd_wq, - &cfile->oplock_break); + + cifs_queue_oplock_break(cfile); spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 1022a3771e14..c36ff0d1fe2a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -581,7 +581,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, NULL /* no data input */, 0 /* no data input */, - (char **)&out_buf, &ret_data_len); + CIFSMaxBufSize, (char **)&out_buf, &ret_data_len); if (rc == -EOPNOTSUPP) { cifs_dbg(FYI, "server does not support query network interfaces\n"); @@ -717,32 +717,28 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId); #endif /* CIFS_DEBUG2 */ - if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) - oplock = smb2_parse_lease_state(server, o_rsp, - &oparms.fid->epoch, - oparms.fid->lease_key); - else - goto oshr_exit; - - memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); tcon->crfid.tcon = tcon; tcon->crfid.is_valid = true; kref_init(&tcon->crfid.refcount); - kref_get(&tcon->crfid.refcount); + if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { + kref_get(&tcon->crfid.refcount); + oplock = smb2_parse_lease_state(server, o_rsp, + &oparms.fid->epoch, + oparms.fid->lease_key); + } else + goto oshr_exit; qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) goto oshr_exit; - rc = smb2_validate_and_copy_iov( + if (!smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), sizeof(struct smb2_file_all_info), &rsp_iov[1], sizeof(struct smb2_file_all_info), - (char *)&tcon->crfid.file_all_info); - if (rc) - goto oshr_exit; - tcon->crfid.file_all_info_is_valid = 1; + (char *)&tcon->crfid.file_all_info)) + tcon->crfid.file_all_info_is_valid = 1; oshr_exit: mutex_unlock(&tcon->crfid.fid_mutex); @@ -1299,7 +1295,7 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */, - NULL, 0 /* no input */, + NULL, 0 /* no input */, CIFSMaxBufSize, (char **)&res_key, &ret_data_len); if (rc) { @@ -1404,7 +1400,7 @@ smb2_ioctl_query_info(const unsigned int xid, rc = SMB2_ioctl_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID, qi.info_type, true, NULL, - 0); + 0, CIFSMaxBufSize); } } else if (qi.flags == PASSTHRU_QUERY_INFO) { memset(&qi_iov, 0, sizeof(qi_iov)); @@ -1532,8 +1528,8 @@ smb2_copychunk_range(const unsigned int xid, rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, true /* is_fsctl */, (char *)pcchunk, - sizeof(struct copychunk_ioctl), (char **)&retbuf, - &ret_data_len); + sizeof(struct copychunk_ioctl), CIFSMaxBufSize, + (char **)&retbuf, &ret_data_len); if (rc == 0) { if (ret_data_len != sizeof(struct copychunk_ioctl_rsp)) { @@ -1693,7 +1689,7 @@ static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_SPARSE, true /* is_fctl */, - &setsparse, 1, NULL, NULL); + &setsparse, 1, CIFSMaxBufSize, NULL, NULL); if (rc) { tcon->broken_sparse_sup = true; cifs_dbg(FYI, "set sparse rc = %d\n", rc); @@ -1766,7 +1762,7 @@ smb2_duplicate_extents(const unsigned int xid, true /* is_fsctl */, (char *)&dup_ext_buf, sizeof(struct duplicate_extents_to_file), - NULL, + CIFSMaxBufSize, NULL, &ret_data_len); if (ret_data_len > 0) @@ -1801,7 +1797,7 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, true /* is_fsctl */, (char *)&integr_info, sizeof(struct fsctl_set_integrity_information_req), - NULL, + CIFSMaxBufSize, NULL, &ret_data_len); } @@ -1809,6 +1805,8 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, /* GMT Token is @GMT-YYYY.MM.DD-HH.MM.SS Unicode which is 48 bytes + null */ #define GMT_TOKEN_SIZE 50 +#define MIN_SNAPSHOT_ARRAY_SIZE 16 /* See MS-SMB2 section 3.3.5.15.1 */ + /* * Input buffer contains (empty) struct smb_snapshot array with size filled in * For output see struct SRV_SNAPSHOT_ARRAY in MS-SMB2 section 2.2.32.2 @@ -1820,13 +1818,29 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, char *retbuf = NULL; unsigned int ret_data_len = 0; int rc; + u32 max_response_size; struct smb_snapshot_array snapshot_in; + if (get_user(ret_data_len, (unsigned int __user *)ioc_buf)) + return -EFAULT; + + /* + * Note that for snapshot queries that servers like Azure expect that + * the first query be minimal size (and just used to get the number/size + * of previous versions) so response size must be specified as EXACTLY + * sizeof(struct snapshot_array) which is 16 when rounded up to multiple + * of eight bytes. + */ + if (ret_data_len == 0) + max_response_size = MIN_SNAPSHOT_ARRAY_SIZE; + else + max_response_size = CIFSMaxBufSize; + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SRV_ENUMERATE_SNAPSHOTS, true /* is_fsctl */, - NULL, 0 /* no input data */, + NULL, 0 /* no input data */, max_response_size, (char **)&retbuf, &ret_data_len); cifs_dbg(FYI, "enum snaphots ioctl returned %d and ret buflen is %d\n", @@ -2304,7 +2318,7 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_DFS_GET_REFERRALS, true /* is_fsctl */, - (char *)dfs_req, dfs_req_size, + (char *)dfs_req, dfs_req_size, CIFSMaxBufSize, (char **)&dfs_rsp, &dfs_rsp_size); } while (rc == -EAGAIN); @@ -2375,6 +2389,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov, &resp_buftype); + if (!rc) + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); if (!rc || !err_iov.iov_base) { rc = -ENOENT; goto free_path; @@ -2658,7 +2674,8 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, rc = SMB2_ioctl_init(tcon, &rqst[num++], cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, true /* is_fctl */, (char *)&fsctl_buf, - sizeof(struct file_zero_data_information)); + sizeof(struct file_zero_data_information), + CIFSMaxBufSize); if (rc) goto zero_range_exit; @@ -2735,7 +2752,8 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, true /* is_fctl */, (char *)&fsctl_buf, - sizeof(struct file_zero_data_information), NULL, NULL); + sizeof(struct file_zero_data_information), + CIFSMaxBufSize, NULL, NULL); free_xid(xid); return rc; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 21ac19ff19cb..a37774a55f3a 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -832,8 +832,11 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) } else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) { /* ops set to 3.0 by default for default so update */ ses->server->ops = &smb21_operations; - } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) + ses->server->vals = &smb21_values; + } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) { ses->server->ops = &smb311_operations; + ses->server->vals = &smb311_values; + } } else if (le16_to_cpu(rsp->DialectRevision) != ses->server->vals->protocol_id) { /* if requested single dialect ensure returned dialect matched */ @@ -1002,7 +1005,8 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, - (char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen); + (char *)pneg_inbuf, inbuflen, CIFSMaxBufSize, + (char **)&pneg_rsp, &rsplen); if (rc == -EOPNOTSUPP) { /* * Old Windows versions or Netapp SMB server can return @@ -1858,8 +1862,9 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, } static struct create_durable_v2 * -create_durable_v2_buf(struct cifs_fid *pfid) +create_durable_v2_buf(struct cifs_open_parms *oparms) { + struct cifs_fid *pfid = oparms->fid; struct create_durable_v2 *buf; buf = kzalloc(sizeof(struct create_durable_v2), GFP_KERNEL); @@ -1873,7 +1878,14 @@ create_durable_v2_buf(struct cifs_fid *pfid) (struct create_durable_v2, Name)); buf->ccontext.NameLength = cpu_to_le16(4); - buf->dcontext.Timeout = 0; /* Should this be configurable by workload */ + /* + * NB: Handle timeout defaults to 0, which allows server to choose + * (most servers default to 120 seconds) and most clients default to 0. + * This can be overridden at mount ("handletimeout=") if the user wants + * a different persistent (or resilient) handle timeout for all opens + * opens on a particular SMB3 mount. + */ + buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout); buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); generate_random_uuid(buf->dcontext.CreateGuid); memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); @@ -1926,7 +1938,7 @@ add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec, struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; - iov[num].iov_base = create_durable_v2_buf(oparms->fid); + iov[num].iov_base = create_durable_v2_buf(oparms); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_durable_v2); @@ -2478,7 +2490,8 @@ creat_exit: int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u32 opcode, - bool is_fsctl, char *in_data, u32 indatalen) + bool is_fsctl, char *in_data, u32 indatalen, + __u32 max_response_size) { struct smb2_ioctl_req *req; struct kvec *iov = rqst->rq_iov; @@ -2520,16 +2533,21 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, req->OutputCount = 0; /* MBZ */ /* - * Could increase MaxOutputResponse, but that would require more - * than one credit. Windows typically sets this smaller, but for some + * In most cases max_response_size is set to 16K (CIFSMaxBufSize) + * We Could increase default MaxOutputResponse, but that could require + * more credits. Windows typically sets this smaller, but for some * ioctls it may be useful to allow server to send more. No point * limiting what the server can send as long as fits in one credit - * Unfortunately - we can not handle more than CIFS_MAX_MSG_SIZE - * (by default, note that it can be overridden to make max larger) - * in responses (except for read responses which can be bigger. - * We may want to bump this limit up + * We can not handle more than CIFS_MAX_BUF_SIZE yet but may want + * to increase this limit up in the future. + * Note that for snapshot queries that servers like Azure expect that + * the first query be minimal size (and just used to get the number/size + * of previous versions) so response size must be specified as EXACTLY + * sizeof(struct snapshot_array) which is 16 when rounded up to multiple + * of eight bytes. Currently that is the only case where we set max + * response size smaller. */ - req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize); + req->MaxOutputResponse = cpu_to_le32(max_response_size); if (is_fsctl) req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); @@ -2550,13 +2568,14 @@ SMB2_ioctl_free(struct smb_rqst *rqst) cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ } + /* * SMB2 IOCTL is used for both IOCTLs and FSCTLs */ int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, bool is_fsctl, - char *in_data, u32 indatalen, + char *in_data, u32 indatalen, u32 max_out_data_len, char **out_data, u32 *plen /* returned data len */) { struct smb_rqst rqst; @@ -2593,8 +2612,8 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, rqst.rq_iov = iov; rqst.rq_nvec = SMB2_IOCTL_IOV_SIZE; - rc = SMB2_ioctl_init(tcon, &rqst, persistent_fid, volatile_fid, - opcode, is_fsctl, in_data, indatalen); + rc = SMB2_ioctl_init(tcon, &rqst, persistent_fid, volatile_fid, opcode, + is_fsctl, in_data, indatalen, max_out_data_len); if (rc) goto ioctl_exit; @@ -2672,7 +2691,8 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SET_COMPRESSION, true /* is_fsctl */, (char *)&fsctl_input /* data input */, - 2 /* in data len */, &ret_data /* out data */, NULL); + 2 /* in data len */, CIFSMaxBufSize /* max out data */, + &ret_data /* out data */, NULL); cifs_dbg(FYI, "set compression rc %d\n", rc); @@ -3431,8 +3451,6 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, rqst.rq_nvec = 1; rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); - rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; if (rc) { @@ -3448,12 +3466,15 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, io_parms->tcon->tid, ses->Suid, io_parms->offset, 0); free_rsp_buf(resp_buftype, rsp_iov.iov_base); + cifs_small_buf_release(req); return rc == -ENODATA ? 0 : rc; } else trace_smb3_read_done(xid, req->PersistentFileId, io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length); + cifs_small_buf_release(req); + *nbytes = le32_to_cpu(rsp->DataLength); if ((*nbytes > CIFS_MAX_MSGSIZE) || (*nbytes > io_parms->length)) { @@ -3752,7 +3773,6 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rc = cifs_send_recv(xid, io_parms->tcon->ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; if (rc) { @@ -3770,6 +3790,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, io_parms->offset, *nbytes); } + cifs_small_buf_release(req); free_rsp_buf(resp_buftype, rsp); return rc; } diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 3c32d0cfea69..52df125e9189 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -142,11 +142,12 @@ extern int SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, extern void SMB2_open_free(struct smb_rqst *rqst); extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, - bool is_fsctl, char *in_data, u32 indatalen, + bool is_fsctl, char *in_data, u32 indatalen, u32 maxoutlen, char **out_data, u32 *plen /* returned data len */); extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u32 opcode, - bool is_fsctl, char *in_data, u32 indatalen); + bool is_fsctl, char *in_data, u32 indatalen, + __u32 max_response_size); extern void SMB2_ioctl_free(struct smb_rqst *rqst); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 97424cf206c0..23f6ebd08e80 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -54,17 +54,11 @@ static struct inode *coda_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void coda_i_callback(struct rcu_head *head) +static void coda_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(coda_inode_cachep, ITOC(inode)); } -static void coda_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, coda_i_callback); -} - static void init_once(void *foo) { struct coda_inode_info *ei = (struct coda_inode_info *) foo; @@ -104,7 +98,7 @@ static int coda_remount(struct super_block *sb, int *flags, char *data) static const struct super_operations coda_super_operations = { .alloc_inode = coda_alloc_inode, - .destroy_inode = coda_destroy_inode, + .free_inode = coda_free_inode, .evict_inode = coda_evict_inode, .put_super = coda_put_super, .statfs = coda_statfs, diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 39843fa7e11b..591e82ba443c 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1690,9 +1690,11 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) switch (whence) { case 1: offset += file->f_pos; + /* fall through */ case 0: if (offset >= 0) break; + /* fall through */ default: return -EINVAL; } diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 5759bcd018cd..8f3a8bc15d98 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -29,10 +29,9 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done) { struct bio_vec *bv; - int i; struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, bio, i, iter_all) { + bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; int ret = fscrypt_decrypt_page(page->mapping->host, page, PAGE_SIZE, 0, page->index); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 322ce9686bdb..2cb4956f8511 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -402,7 +402,6 @@ static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) { SHASH_DESC_ON_STACK(desc, tfm); desc->tfm = tfm; - desc->flags = 0; return crypto_shash_digest(desc, key, keysize, salt); } @@ -33,6 +33,7 @@ #include <linux/sizes.h> #include <linux/mmu_notifier.h> #include <linux/iomap.h> +#include <asm/pgalloc.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -1407,7 +1408,9 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; + struct vm_area_struct *vma = vmf->vma; struct inode *inode = mapping->host; + pgtable_t pgtable = NULL; struct page *zero_page; spinlock_t *ptl; pmd_t pmd_entry; @@ -1422,12 +1425,22 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, DAX_PMD | DAX_ZERO_PAGE, false); + if (arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm); + if (!pgtable) + return VM_FAULT_OOM; + } + ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { spin_unlock(ptl); goto fallback; } + if (pgtable) { + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + mm_inc_nr_ptes(vma->vm_mm); + } pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); pmd_entry = pmd_mkhuge(pmd_entry); set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); @@ -1436,6 +1449,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, return VM_FAULT_NOPAGE; fallback: + if (pgtable) + pte_free(vma->vm_mm, pgtable); trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); return VM_FAULT_FALLBACK; } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 4fce1da7db23..ddd708b09fa1 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -394,12 +394,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * - * If debugfs is not enabled in the kernel, the value -%ENODEV will be - * returned. It is not wise to check for this value, but rather, check for - * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling - * code. + * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * be returned. */ struct dentry *debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value) @@ -440,12 +439,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * - * If debugfs is not enabled in the kernel, the value -%ENODEV will be - * returned. It is not wise to check for this value, but rather, check for - * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling - * code. + * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * be returned. */ struct dentry *debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value) @@ -486,12 +484,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * - * If debugfs is not enabled in the kernel, the value -%ENODEV will be - * returned. It is not wise to check for this value, but rather, check for - * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling - * code. + * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * be returned. */ struct dentry *debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value) @@ -533,12 +530,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * - * If debugfs is not enabled in the kernel, the value -%ENODEV will be - * returned. It is not wise to check for this value, but rather, check for - * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling - * code. + * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * be returned. */ struct dentry *debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value) @@ -582,12 +578,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * - * If debugfs is not enabled in the kernel, the value -%ENODEV will be - * returned. It is not wise to check for this value, but rather, check for - * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling - * code. + * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * be returned. */ struct dentry *debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent, unsigned long *value) @@ -850,12 +845,11 @@ static const struct file_operations fops_bool_wo = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * - * If debugfs is not enabled in the kernel, the value -%ENODEV will be - * returned. It is not wise to check for this value, but rather, check for - * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling - * code. + * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * be returned. */ struct dentry *debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, bool *value) @@ -904,12 +898,11 @@ static const struct file_operations fops_blob = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * - * If debugfs is not enabled in the kernel, the value -%ENODEV will be - * returned. It is not wise to check for this value, but rather, check for - * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling - * code. + * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * be returned. */ struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, @@ -1005,8 +998,9 @@ static const struct file_operations u32_array_fops = { * Writing is not supported. Seek within the file is also not supported. * Once array is created its size can not be changed. * - * The function returns a pointer to dentry on success. If debugfs is not - * enabled in the kernel, the value -%ENODEV will be returned. + * The function returns a pointer to dentry on success. If an error occurs, + * %ERR_PTR(-ERROR) or NULL will be returned. If debugfs is not enabled in + * the kernel, the value %ERR_PTR(-ENODEV) will be returned. */ struct dentry *debugfs_create_u32_array(const char *name, umode_t mode, struct dentry *parent, @@ -1102,12 +1096,11 @@ static const struct file_operations fops_regset32 = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. + * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * returned. * - * If debugfs is not enabled in the kernel, the value -%ENODEV will be - * returned. It is not wise to check for this value, but rather, check for - * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling - * code. + * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * be returned. */ struct dentry *debugfs_create_regset32(const char *name, umode_t mode, struct dentry *parent, diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 8b8225211a14..acef14ad53db 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -163,19 +163,18 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void debugfs_evict_inode(struct inode *inode) +static void debugfs_free_inode(struct inode *inode) { - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); + free_inode_nonrcu(inode); } static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .remount_fs = debugfs_remount, .show_options = debugfs_show_options, - .evict_inode = debugfs_evict_inode, + .free_inode = debugfs_free_inode, }; static void debugfs_release_dentry(struct dentry *dentry) diff --git a/fs/direct-io.c b/fs/direct-io.c index 9bb015bc4a83..fbe885d68035 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -538,7 +538,6 @@ static struct bio *dio_await_one(struct dio *dio) static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) { struct bio_vec *bvec; - unsigned i; blk_status_t err = bio->bi_status; if (err) { @@ -553,7 +552,7 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) } else { struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; if (dio->op == REQ_OP_READ && !PageCompound(page) && diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index f664da55234e..491cf5baa8c2 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -68,7 +68,6 @@ static int ecryptfs_hash_digest(struct crypto_shash *tfm, int err; desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_digest(desc, src, len, dst); shash_desc_zero(desc); return err; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index e74fe84d0886..90fbac5d485b 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -769,7 +769,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, } s->hash_desc->tfm = s->hash_tfm; - s->hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; rc = crypto_shash_digest(s->hash_desc, (u8 *)s->auth_tok->token.password.session_key_encryption_key, diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 85411ceb0508..c3e511f2b6c0 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -67,9 +67,8 @@ out: return inode; } -static void ecryptfs_i_callback(struct rcu_head *head) +static void ecryptfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); struct ecryptfs_inode_info *inode_info; inode_info = ecryptfs_inode_to_private(inode); @@ -92,7 +91,6 @@ static void ecryptfs_destroy_inode(struct inode *inode) inode_info = ecryptfs_inode_to_private(inode); BUG_ON(inode_info->lower_file); ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); - call_rcu(&inode->i_rcu, ecryptfs_i_callback); } /** @@ -186,6 +184,7 @@ static int ecryptfs_show_options(struct seq_file *m, struct dentry *root) const struct super_operations ecryptfs_sops = { .alloc_inode = ecryptfs_alloc_inode, .destroy_inode = ecryptfs_destroy_inode, + .free_inode = ecryptfs_free_inode, .statfs = ecryptfs_statfs, .remount_fs = NULL, .evict_inode = ecryptfs_evict_inode, diff --git a/fs/efs/super.c b/fs/efs/super.c index 6ffb7ba1547a..867fc24dee20 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -74,17 +74,11 @@ static struct inode *efs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void efs_i_callback(struct rcu_head *head) +static void efs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); } -static void efs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, efs_i_callback); -} - static void init_once(void *foo) { struct efs_inode_info *ei = (struct efs_inode_info *) foo; @@ -122,7 +116,7 @@ static int efs_remount(struct super_block *sb, int *flags, char *data) static const struct super_operations efs_superblock_operations = { .alloc_inode = efs_alloc_inode, - .destroy_inode = efs_destroy_inode, + .free_inode = efs_free_inode, .statfs = efs_statfs, .remount_fs = efs_remount, }; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 0128010a0874..3988633789cb 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -192,17 +192,11 @@ static struct inode *ext2_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void ext2_i_callback(struct rcu_head *head) +static void ext2_free_in_core_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); } -static void ext2_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, ext2_i_callback); -} - static void init_once(void *foo) { struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; @@ -351,7 +345,7 @@ static const struct quotactl_ops ext2_quotactl_ops = { static const struct super_operations ext2_sops = { .alloc_inode = ext2_alloc_inode, - .destroy_inode = ext2_destroy_inode, + .free_inode = ext2_free_in_core_inode, .write_inode = ext2_write_inode, .evict_inode = ext2_evict_inode, .put_super = ext2_put_super, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 82ffdacdc7fa..0833b5fc0668 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2024,7 +2024,6 @@ static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); desc.shash.tfm = sbi->s_chksum_driver; - desc.shash.flags = 0; *(u32 *)desc.ctx = crc; BUG_ON(crypto_shash_update(&desc.shash, address, length)); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 3e9298e6a705..4690618a92e9 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -61,11 +61,10 @@ static void buffer_io_error(struct buffer_head *bh) static void ext4_finish_bio(struct bio *bio) { - int i; struct bio_vec *bvec; struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; #ifdef CONFIG_FS_ENCRYPTION struct page *data_page = NULL; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3adadf461825..3629a74b7f94 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -71,7 +71,6 @@ static inline bool ext4_bio_encrypted(struct bio *bio) static void mpage_end_io(struct bio *bio) { struct bio_vec *bv; - int i; struct bvec_iter_all iter_all; if (ext4_bio_encrypted(bio)) { @@ -82,7 +81,7 @@ static void mpage_end_io(struct bio *bio) return; } } - bio_for_each_segment_all(bv, bio, i, iter_all) { + bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; if (!bio->bi_status) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e7ae26e36c9c..38faf661e237 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1760,8 +1760,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_msg(sb, KERN_ERR, "filesystem too large to resize to %llu blocks safely", n_blocks_count); - if (sizeof(sector_t) < 8) - ext4_warning(sb, "CONFIG_LBDAF not enabled"); return -EINVAL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6ed4eb81e674..0e63069b9d5b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1107,9 +1107,8 @@ static int ext4_drop_inode(struct inode *inode) return drop; } -static void ext4_i_callback(struct rcu_head *head) +static void ext4_free_in_core_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } @@ -1124,7 +1123,6 @@ static void ext4_destroy_inode(struct inode *inode) true); dump_stack(); } - call_rcu(&inode->i_rcu, ext4_i_callback); } static void init_once(void *foo) @@ -1402,6 +1400,7 @@ static const struct quotactl_ops ext4_qctl_operations = { static const struct super_operations ext4_sops = { .alloc_inode = ext4_alloc_inode, + .free_inode = ext4_free_in_core_inode, .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, @@ -2706,13 +2705,9 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files) loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; - /* small i_blocks in vfs inode? */ - if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { - /* - * CONFIG_LBDAF is not enabled implies the inode - * i_block represent total blocks in 512 bytes - * 32 == size of vfs inode i_blocks * 8 - */ + BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64)); + + if (!has_huge_files) { upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ @@ -2753,11 +2748,11 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) * number of 512-byte sectors of the file. */ - if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files) { /* - * !has_huge_files or CONFIG_LBDAF not enabled implies that - * the inode i_block field represents total file blocks in - * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 + * !has_huge_files or implies that the inode i_block field + * represents total file blocks in 2^32 512-byte sectors == + * size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; @@ -2897,18 +2892,6 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) ~EXT4_FEATURE_RO_COMPAT_SUPP)); return 0; } - /* - * Large file size enabled file system can only be mounted - * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF - */ - if (ext4_has_feature_huge_file(sb)) { - if (sizeof(blkcnt_t) < sizeof(u64)) { - ext4_msg(sb, KERN_ERR, "Filesystem with huge files " - "cannot be mounted RDWR without " - "CONFIG_LBDAF"); - return 0; - } - } if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { ext4_msg(sb, KERN_ERR, "Can't support bigalloc feature without " @@ -4057,8 +4040,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); - if (sizeof(sector_t) < 8) - ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); goto failed_mount; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9727944139f2..64040e998439 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -86,10 +86,9 @@ static void __read_end_io(struct bio *bio) { struct page *page; struct bio_vec *bv; - int i; struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, bio, i, iter_all) { + bio_for_each_segment_all(bv, bio, iter_all) { page = bv->bv_page; /* PG_error was set if any post_read step failed */ @@ -164,7 +163,6 @@ static void f2fs_write_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = bio->bi_private; struct bio_vec *bvec; - int i; struct bvec_iter_all iter_all; if (time_to_inject(sbi, FAULT_WRITE_IO)) { @@ -172,7 +170,7 @@ static void f2fs_write_end_io(struct bio *bio) bio->bi_status = BLK_STS_IOERR; } - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); @@ -349,7 +347,6 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, { struct bio_vec *bvec; struct page *target; - int i; struct bvec_iter_all iter_all; if (!io->bio) @@ -358,7 +355,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, if (!inode && !page && !ino) return true; - bio_for_each_segment_all(bvec, io->bio, i, iter_all) { + bio_for_each_segment_all(bvec, io->bio, iter_all) { if (bvec->bv_page->mapping) target = bvec->bv_page; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 87f75ebd2fd6..bacf5c2a8850 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1338,7 +1338,7 @@ struct f2fs_private_dio { #ifdef CONFIG_F2FS_FAULT_INJECTION #define f2fs_show_injection_info(type) \ - printk_ratelimited("%sF2FS-fs : inject %s in %s of %pF\n", \ + printk_ratelimited("%sF2FS-fs : inject %s in %s of %pS\n", \ KERN_INFO, f2fs_fault_name[type], \ __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) @@ -1422,7 +1422,6 @@ static inline u32 __f2fs_crc32(struct f2fs_sb_info *sbi, u32 crc, BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); desc.shash.tfm = sbi->s_chksum_driver; - desc.shash.flags = 0; *(u32 *)desc.ctx = crc; err = crypto_shash_update(&desc.shash, address, length); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3f99ab288695..d6e48a6487d5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -616,8 +616,10 @@ pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) switch (dn->max_level) { case 3: base += 2 * indirect_blks; + /* fall through */ case 2: base += 2 * direct_blks; + /* fall through */ case 1: base += direct_index; break; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f2aaa2cc6b3e..9924eac76254 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1000,17 +1000,11 @@ static void f2fs_dirty_inode(struct inode *inode, int flags) f2fs_inode_dirtied(inode, false); } -static void f2fs_i_callback(struct rcu_head *head) +static void f2fs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode)); } -static void f2fs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, f2fs_i_callback); -} - static void destroy_percpu_info(struct f2fs_sb_info *sbi) { percpu_counter_destroy(&sbi->alloc_valid_block_count); @@ -2166,8 +2160,8 @@ void f2fs_quota_off_umount(struct super_block *sb) static const struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, + .free_inode = f2fs_free_inode, .drop_inode = f2fs_drop_inode, - .destroy_inode = f2fs_destroy_inode, .write_inode = f2fs_write_inode, .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 79bb0e73a65f..ba93d1373306 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -746,17 +746,11 @@ static struct inode *fat_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void fat_i_callback(struct rcu_head *head) +static void fat_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); } -static void fat_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, fat_i_callback); -} - static void init_once(void *foo) { struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; @@ -920,7 +914,7 @@ EXPORT_SYMBOL_GPL(fat_sync_inode); static int fat_show_options(struct seq_file *m, struct dentry *root); static const struct super_operations fat_sops = { .alloc_inode = fat_alloc_inode, - .destroy_inode = fat_destroy_inode, + .free_inode = fat_free_inode, .write_inode = fat_write_inode, .evict_inode = fat_evict_inode, .put_super = fat_put_super, diff --git a/fs/fcntl.c b/fs/fcntl.c index 083185174c6d..3d40771e8e7c 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -769,7 +769,7 @@ static void send_sigio_to_task(struct task_struct *p, si.si_fd = fd; if (!do_send_sig_info(signum, &si, p, type)) break; - /* fall-through: fall back on the old plain SIGIO signal */ + /* fall-through - fall back on the old plain SIGIO signal */ case 0: do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type); } diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 48b24bb50d02..a89f68c3cbed 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -131,21 +131,14 @@ static struct inode *vxfs_alloc_inode(struct super_block *sb) return &vi->vfs_inode; } -static void vxfs_i_callback(struct rcu_head *head) +static void vxfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(vxfs_inode_cachep, VXFS_INO(inode)); } -static void vxfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, vxfs_i_callback); -} - static const struct super_operations vxfs_super_ops = { .alloc_inode = vxfs_alloc_inode, - .destroy_inode = vxfs_destroy_inode, + .free_inode = vxfs_free_inode, .evict_inode = vxfs_evict_inode, .put_super = vxfs_put_super, .statfs = vxfs_statfs, diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8a63e52785e9..9971a35cf1ef 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2056,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; ret = -EINVAL; - if (rem < len) { - pipe_unlock(pipe); - goto out; - } + if (rem < len) + goto out_free; rem = len; while (rem) { @@ -2077,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; } else { - pipe_buf_get(pipe, ibuf); + if (!pipe_buf_get(pipe, ibuf)) + goto out_free; + *obuf = *ibuf; obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->len = rem; @@ -2100,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); pipe_lock(pipe); +out_free: for (idx = 0; idx < nbuf; idx++) pipe_buf_release(pipe, &bufs[idx]); pipe_unlock(pipe); -out: kvfree(bufs); return ret; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ec5d9953dfb6..f485d09d14df 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -107,34 +107,30 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) return inode; } -static void fuse_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(fuse_inode_cachep, inode); -} - -static void fuse_destroy_inode(struct inode *inode) +static void fuse_free_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); - if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) { - WARN_ON(!list_empty(&fi->write_files)); - WARN_ON(!list_empty(&fi->queued_writes)); - } + mutex_destroy(&fi->mutex); kfree(fi->forget); - call_rcu(&inode->i_rcu, fuse_i_callback); + kmem_cache_free(fuse_inode_cachep, fi); } static void fuse_evict_inode(struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); fi->forget = NULL; } + if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) { + WARN_ON(!list_empty(&fi->write_files)); + WARN_ON(!list_empty(&fi->queued_writes)); + } } static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) @@ -814,7 +810,7 @@ static const struct export_operations fuse_export_operations = { static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, - .destroy_inode = fuse_destroy_inode, + .free_inode = fuse_free_inode, .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 3ed2b088dcfd..6a1e499543f5 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,5 @@ config GFS2_FS tristate "GFS2 file system support" - depends on (64BIT || LBDAF) select FS_POSIX_ACL select CRC32 select LIBCRC32C diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 02b2646d84b3..2f9290f69610 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -710,7 +710,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, } if (n == 0) break; - /* Branching from existing tree */ + /* fall through - To branching from existing tree */ case ALLOC_GROW_DEPTH: if (i > 1 && i < mp->mp_fheight) gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]); @@ -721,7 +721,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, state = ALLOC_DATA; if (n == 0) break; - /* Tree complete, adding data blocks */ + /* fall through - To tree complete, adding data blocks */ case ALLOC_DATA: BUG_ON(n > dblks); BUG_ON(mp->mp_bh[end_of_metadata] == NULL); @@ -965,15 +965,20 @@ static void gfs2_write_unlock(struct inode *inode) gfs2_glock_dq_uninit(&ip->i_gh); } -static void gfs2_iomap_journaled_page_done(struct inode *inode, loff_t pos, - unsigned copied, struct page *page, - struct iomap *iomap) +static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, + unsigned copied, struct page *page, + struct iomap *iomap) { struct gfs2_inode *ip = GFS2_I(inode); - gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied); + if (page) + gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied); } +static const struct iomap_page_ops gfs2_iomap_page_ops = { + .page_done = gfs2_iomap_page_done, +}; + static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap *iomap, @@ -1051,7 +1056,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, } } if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip)) - iomap->page_done = gfs2_iomap_journaled_page_done; + iomap->page_ops = &gfs2_iomap_page_ops; return 0; out_trans_end: diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 8722c60b11fe..6f09b5e3dd6e 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -207,7 +207,6 @@ static void gfs2_end_log_write(struct bio *bio) struct gfs2_sbd *sdp = bio->bi_private; struct bio_vec *bvec; struct page *page; - int i; struct bvec_iter_all iter_all; if (bio->bi_status) { @@ -216,7 +215,7 @@ static void gfs2_end_log_write(struct bio *bio) wake_up(&sdp->sd_logd_waitq); } - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { page = bvec->bv_page; if (page_has_buffers(page)) gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 3201342404a7..ff86e1d4f8ff 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -189,10 +189,9 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) static void gfs2_meta_read_endio(struct bio *bio) { struct bio_vec *bvec; - int i; struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i, iter_all) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; struct buffer_head *bh = page_buffers(page); unsigned int len = bvec->bv_len; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ca71163ff7cf..7b8d2306b3d3 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1736,20 +1736,14 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) return &ip->i_inode; } -static void gfs2_i_callback(struct rcu_head *head) +static void gfs2_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(gfs2_inode_cachep, inode); -} - -static void gfs2_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, gfs2_i_callback); + kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode)); } const struct super_operations gfs2_super_ops = { .alloc_inode = gfs2_alloc_inode, - .destroy_inode = gfs2_destroy_inode, + .free_inode = gfs2_free_inode, .write_inode = gfs2_write_inode, .dirty_inode = gfs2_dirty_inode, .evict_inode = gfs2_evict_inode, diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 173876782f73..c33324686d89 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -167,20 +167,14 @@ static struct inode *hfs_alloc_inode(struct super_block *sb) return i ? &i->vfs_inode : NULL; } -static void hfs_i_callback(struct rcu_head *head) +static void hfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); } -static void hfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, hfs_i_callback); -} - static const struct super_operations hfs_super_operations = { .alloc_inode = hfs_alloc_inode, - .destroy_inode = hfs_destroy_inode, + .free_inode = hfs_free_inode, .write_inode = hfs_write_inode, .evict_inode = hfs_evict_inode, .put_super = hfs_put_super, diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index eb4535eba95d..0cc5feff76cd 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -18,7 +18,7 @@ #include <linux/nls.h> static struct inode *hfsplus_alloc_inode(struct super_block *sb); -static void hfsplus_destroy_inode(struct inode *inode); +static void hfsplus_free_inode(struct inode *inode); #include "hfsplus_fs.h" #include "xattr.h" @@ -361,7 +361,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) static const struct super_operations hfsplus_sops = { .alloc_inode = hfsplus_alloc_inode, - .destroy_inode = hfsplus_destroy_inode, + .free_inode = hfsplus_free_inode, .write_inode = hfsplus_write_inode, .evict_inode = hfsplus_evict_inode, .put_super = hfsplus_put_super, @@ -628,18 +628,11 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb) return i ? &i->vfs_inode : NULL; } -static void hfsplus_i_callback(struct rcu_head *head) +static void hfsplus_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode)); } -static void hfsplus_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, hfsplus_i_callback); -} - #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) static struct dentry *hfsplus_mount(struct file_system_type *fs_type, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 444c7b170359..5a7eb0c79839 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -243,17 +243,11 @@ static void hostfs_evict_inode(struct inode *inode) } } -static void hostfs_i_callback(struct rcu_head *head) +static void hostfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kfree(HOSTFS_I(inode)); } -static void hostfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, hostfs_i_callback); -} - static int hostfs_show_options(struct seq_file *seq, struct dentry *root) { const char *root_path = root->d_sb->s_fs_info; @@ -270,7 +264,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations hostfs_sbops = { .alloc_inode = hostfs_alloc_inode, - .destroy_inode = hostfs_destroy_inode, + .free_inode = hostfs_free_inode, .evict_inode = hostfs_evict_inode, .statfs = hostfs_statfs, .show_options = hostfs_show_options, diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index f2c3ebcd309c..ed4264bca790 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -238,17 +238,11 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void hpfs_i_callback(struct rcu_head *head) +static void hpfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); } -static void hpfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, hpfs_i_callback); -} - static void init_once(void *foo) { struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; @@ -532,7 +526,7 @@ static int hpfs_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations hpfs_sops = { .alloc_inode = hpfs_alloc_inode, - .destroy_inode = hpfs_destroy_inode, + .free_inode = hpfs_free_inode, .evict_inode = hpfs_evict_inode, .put_super = hpfs_put_super, .statfs = hpfs_statfs, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ec32fece5e1e..c74ef4426282 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -755,11 +755,17 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev) { struct inode *inode; - struct resv_map *resv_map; + struct resv_map *resv_map = NULL; - resv_map = resv_map_alloc(); - if (!resv_map) - return NULL; + /* + * Reserve maps are only needed for inodes that can have associated + * page allocations. + */ + if (S_ISREG(mode) || S_ISLNK(mode)) { + resv_map = resv_map_alloc(); + if (!resv_map) + return NULL; + } inode = new_inode(sb); if (inode) { @@ -794,8 +800,10 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); - } else - kref_put(&resv_map->refs, resv_map_release); + } else { + if (resv_map) + kref_put(&resv_map->refs, resv_map_release); + } return inode; } @@ -1043,9 +1051,8 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) return &p->vfs_inode; } -static void hugetlbfs_i_callback(struct rcu_head *head) +static void hugetlbfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } @@ -1053,7 +1060,6 @@ static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); - call_rcu(&inode->i_rcu, hugetlbfs_i_callback); } static const struct address_space_operations hugetlbfs_aops = { @@ -1100,6 +1106,7 @@ static const struct inode_operations hugetlbfs_inode_operations = { static const struct super_operations hugetlbfs_ops = { .alloc_inode = hugetlbfs_alloc_inode, + .free_inode = hugetlbfs_free_inode, .destroy_inode = hugetlbfs_destroy_inode, .evict_inode = hugetlbfs_evict_inode, .statfs = hugetlbfs_statfs, diff --git a/fs/inode.c b/fs/inode.c index e9d97add2b36..16b10e53292e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -202,12 +202,28 @@ out: } EXPORT_SYMBOL(inode_init_always); +void free_inode_nonrcu(struct inode *inode) +{ + kmem_cache_free(inode_cachep, inode); +} +EXPORT_SYMBOL(free_inode_nonrcu); + +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + if (inode->free_inode) + inode->free_inode(inode); + else + free_inode_nonrcu(inode); +} + static struct inode *alloc_inode(struct super_block *sb) { + const struct super_operations *ops = sb->s_op; struct inode *inode; - if (sb->s_op->alloc_inode) - inode = sb->s_op->alloc_inode(sb); + if (ops->alloc_inode) + inode = ops->alloc_inode(sb); else inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); @@ -215,22 +231,19 @@ static struct inode *alloc_inode(struct super_block *sb) return NULL; if (unlikely(inode_init_always(sb, inode))) { - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - kmem_cache_free(inode_cachep, inode); + if (ops->destroy_inode) { + ops->destroy_inode(inode); + if (!ops->free_inode) + return NULL; + } + inode->free_inode = ops->free_inode; + i_callback(&inode->i_rcu); return NULL; } return inode; } -void free_inode_nonrcu(struct inode *inode) -{ - kmem_cache_free(inode_cachep, inode); -} -EXPORT_SYMBOL(free_inode_nonrcu); - void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); @@ -253,20 +266,19 @@ void __destroy_inode(struct inode *inode) } EXPORT_SYMBOL(__destroy_inode); -static void i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(inode_cachep, inode); -} - static void destroy_inode(struct inode *inode) { + const struct super_operations *ops = inode->i_sb->s_op; + BUG_ON(!list_empty(&inode->i_lru)); __destroy_inode(inode); - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - call_rcu(&inode->i_rcu, i_callback); + if (ops->destroy_inode) { + ops->destroy_inode(inode); + if (!ops->free_inode) + return; + } + inode->free_inode = ops->free_inode; + call_rcu(&inode->i_rcu, i_callback); } /** @@ -1817,8 +1829,13 @@ int file_remove_privs(struct file *file) int kill; int error = 0; - /* Fast path for nothing security related */ - if (IS_NOSEC(inode)) + /* + * Fast path for nothing security related. + * As well for non-regular files, e.g. blkdev inodes. + * For example, blkdev_write_iter() might get here + * trying to remove privs which it is not allowed to. + */ + if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(dentry); diff --git a/fs/internal.h b/fs/internal.h index 8102032432cf..17a8ae967493 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -44,7 +44,7 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) extern void guard_bio_eod(int rw, struct bio *bio); extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, get_block_t *get_block, struct iomap *iomap); -int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied, +void __generic_write_end(struct inode *inode, loff_t pos, unsigned copied, struct page *page); /* diff --git a/fs/io_uring.c b/fs/io_uring.c index bbdbd56cf2ac..48ea3977012a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4,15 +4,28 @@ * supporting fast/efficient IO. * * A note on the read/write ordering memory barriers that are matched between - * the application and kernel side. When the application reads the CQ ring - * tail, it must use an appropriate smp_rmb() to order with the smp_wmb() - * the kernel uses after writing the tail. Failure to do so could cause a - * delay in when the application notices that completion events available. - * This isn't a fatal condition. Likewise, the application must use an - * appropriate smp_wmb() both before writing the SQ tail, and after writing - * the SQ tail. The first one orders the sqe writes with the tail write, and - * the latter is paired with the smp_rmb() the kernel will issue before - * reading the SQ tail on submission. + * the application and kernel side. + * + * After the application reads the CQ ring tail, it must use an + * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses + * before writing the tail (using smp_load_acquire to read the tail will + * do). It also needs a smp_mb() before updating CQ head (ordering the + * entry load(s) with the head store), pairing with an implicit barrier + * through a control-dependency in io_get_cqring (smp_store_release to + * store head will do). Failure to do so could lead to reading invalid + * CQ entries. + * + * Likewise, the application must use an appropriate smp_wmb() before + * writing the SQ tail (ordering SQ entry stores with the tail store), + * which pairs with smp_load_acquire in io_get_sqring (smp_store_release + * to store the tail will do). And it needs a barrier ordering the SQ + * head load before writing new SQ entries (smp_load_acquire to read + * head will do). + * + * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application + * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* + * updating the SQ tail; a full memory barrier smp_mb() is needed + * between. * * Also see the examples in the liburing library: * @@ -70,20 +83,108 @@ struct io_uring { u32 tail ____cacheline_aligned_in_smp; }; +/* + * This data is shared with the application through the mmap at offset + * IORING_OFF_SQ_RING. + * + * The offsets to the member fields are published through struct + * io_sqring_offsets when calling io_uring_setup. + */ struct io_sq_ring { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The kernel controls head and the application controls tail. + */ struct io_uring r; + /* + * Bitmask to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ u32 ring_mask; + /* Ring size (constant, power of 2) */ u32 ring_entries; + /* + * Number of invalid entries dropped by the kernel due to + * invalid index stored in array + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * After a new SQ head value was read by the application this + * counter includes all submissions that were dropped reaching + * the new SQ head (and possibly more). + */ u32 dropped; + /* + * Runtime flags + * + * Written by the kernel, shouldn't be modified by the + * application. + * + * The application needs a full memory barrier before checking + * for IORING_SQ_NEED_WAKEUP after updating the sq tail. + */ u32 flags; + /* + * Ring buffer of indices into array of io_uring_sqe, which is + * mmapped by the application using the IORING_OFF_SQES offset. + * + * This indirection could e.g. be used to assign fixed + * io_uring_sqe entries to operations and only submit them to + * the queue when needed. + * + * The kernel modifies neither the indices array nor the entries + * array. + */ u32 array[]; }; +/* + * This data is shared with the application through the mmap at offset + * IORING_OFF_CQ_RING. + * + * The offsets to the member fields are published through struct + * io_cqring_offsets when calling io_uring_setup. + */ struct io_cq_ring { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The application controls head and the kernel tail. + */ struct io_uring r; + /* + * Bitmask to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ u32 ring_mask; + /* Ring size (constant, power of 2) */ u32 ring_entries; + /* + * Number of completion events lost because the queue was full; + * this should be avoided by the application by making sure + * there are not more requests pending thatn there is space in + * the completion queue. + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * As completion events come in out of order this counter is not + * ordered with any other data. + */ u32 overflow; + /* + * Ring buffer of completion events. + * + * The kernel writes completion events fresh every time they are + * produced, so the application is allowed to modify pending + * entries. + */ struct io_uring_cqe cqes[]; }; @@ -121,6 +222,8 @@ struct io_ring_ctx { unsigned sq_mask; unsigned sq_thread_idle; struct io_uring_sqe *sq_sqes; + + struct list_head defer_list; } ____cacheline_aligned_in_smp; /* IO offload */ @@ -138,6 +241,7 @@ struct io_ring_ctx { unsigned cq_mask; struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; + struct eventfd_ctx *cq_ev_fd; } ____cacheline_aligned_in_smp; /* @@ -221,13 +325,16 @@ struct io_kiocb { struct list_head list; unsigned int flags; refcount_t refs; -#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ +#define REQ_F_NOWAIT 1 /* must not punt to workers */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ #define REQ_F_FIXED_FILE 4 /* ctx owns file */ #define REQ_F_SEQ_PREV 8 /* sequential with previous */ #define REQ_F_PREPPED 16 /* prep already done */ +#define REQ_F_IO_DRAIN 32 /* drain existing IO first */ +#define REQ_F_IO_DRAINED 64 /* drain done */ u64 user_data; - u64 error; + u32 error; /* iopoll result from callback */ + u32 sequence; struct work_struct work; }; @@ -255,6 +362,8 @@ struct io_submit_state { unsigned int ios_left; }; +static void io_sq_wq_submit_work(struct work_struct *work); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -306,10 +415,36 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->cancel_list); + INIT_LIST_HEAD(&ctx->defer_list); return ctx; } -static void io_commit_cqring(struct io_ring_ctx *ctx) +static inline bool io_sequence_defer(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) + return false; + + return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped; +} + +static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + if (list_empty(&ctx->defer_list)) + return NULL; + + req = list_first_entry(&ctx->defer_list, struct io_kiocb, list); + if (!io_sequence_defer(ctx, req)) { + list_del_init(&req->list); + return req; + } + + return NULL; +} + +static void __io_commit_cqring(struct io_ring_ctx *ctx) { struct io_cq_ring *ring = ctx->cq_ring; @@ -317,12 +452,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) /* order cqe stores with ring update */ smp_store_release(&ring->r.tail, ctx->cached_cq_tail); - /* - * Write sider barrier of tail update, app has read side. See - * comment at the top of this file. - */ - smp_wmb(); - if (wq_has_sleeper(&ctx->cq_wait)) { wake_up_interruptible(&ctx->cq_wait); kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); @@ -330,15 +459,30 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) } } +static void io_commit_cqring(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + __io_commit_cqring(ctx); + + while ((req = io_get_deferred_req(ctx)) != NULL) { + req->flags |= REQ_F_IO_DRAINED; + queue_work(ctx->sqo_wq, &req->work); + } +} + static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) { struct io_cq_ring *ring = ctx->cq_ring; unsigned tail; tail = ctx->cached_cq_tail; - /* See comment at the top of the file */ - smp_rmb(); - if (tail + 1 == READ_ONCE(ring->r.head)) + /* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ + if (tail - READ_ONCE(ring->r.head) == ring->ring_entries) return NULL; ctx->cached_cq_tail++; @@ -373,6 +517,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); + if (ctx->cq_ev_fd) + eventfd_signal(ctx->cq_ev_fd, 1); } static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, @@ -682,11 +828,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req) list_add_tail(&req->list, &ctx->poll_list); } -static void io_file_put(struct io_submit_state *state, struct file *file) +static void io_file_put(struct io_submit_state *state) { - if (!state) { - fput(file); - } else if (state->file) { + if (state->file) { int diff = state->has_refs - state->used_refs; if (diff) @@ -711,7 +855,7 @@ static struct file *io_file_get(struct io_submit_state *state, int fd) state->ios_left--; return state->file; } - io_file_put(state, NULL); + io_file_put(state); } state->file = fget_many(fd, state->ios_left); if (!state->file) @@ -742,7 +886,7 @@ static bool io_file_supports_async(struct file *file) } static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) + bool force_nonblock) { const struct io_uring_sqe *sqe = s->sqe; struct io_ring_ctx *ctx = req->ctx; @@ -776,10 +920,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); if (unlikely(ret)) return ret; - if (force_nonblock) { + + /* don't allow async punt if RWF_NOWAIT was requested */ + if (kiocb->ki_flags & IOCB_NOWAIT) + req->flags |= REQ_F_NOWAIT; + + if (force_nonblock) kiocb->ki_flags |= IOCB_NOWAIT; - req->flags |= REQ_F_FORCE_NONBLOCK; - } + if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !kiocb->ki_filp->f_op->iopoll) @@ -940,7 +1088,7 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) } static int io_read(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) + bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; @@ -949,7 +1097,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, size_t iov_count; int ret; - ret = io_prep_rw(req, s, force_nonblock, state); + ret = io_prep_rw(req, s, force_nonblock); if (ret) return ret; file = kiocb->ki_filp; @@ -987,7 +1135,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, } static int io_write(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock, struct io_submit_state *state) + bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw; @@ -996,7 +1144,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, size_t iov_count; int ret; - ret = io_prep_rw(req, s, force_nonblock, state); + ret = io_prep_rw(req, s, force_nonblock); if (ret) return ret; @@ -1122,6 +1270,54 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret = 0; + + if (!req->file) + return -EBADF; + /* Prep already done (EAGAIN retry) */ + if (req->flags & REQ_F_PREPPED) + return 0; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + return -EINVAL; + + req->flags |= REQ_F_PREPPED; + return ret; +} + +static int io_sync_file_range(struct io_kiocb *req, + const struct io_uring_sqe *sqe, + bool force_nonblock) +{ + loff_t sqe_off; + loff_t sqe_len; + unsigned flags; + int ret; + + ret = io_prep_sfr(req, sqe); + if (ret) + return ret; + + /* sync_file_range always requires a blocking context */ + if (force_nonblock) + return -EAGAIN; + + sqe_off = READ_ONCE(sqe->off); + sqe_len = READ_ONCE(sqe->len); + flags = READ_ONCE(sqe->sync_range_flags); + + ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); + + io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); + io_put_req(req); + return 0; +} + static void io_poll_remove_one(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; @@ -1324,7 +1520,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) spin_unlock(&poll->head->lock); } if (mask) { /* no async, we'd stolen it */ - req->error = mangle_poll(mask); ipt.error = 0; io_poll_complete(ctx, req, mask); } @@ -1337,9 +1532,36 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ipt.error; } +static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_uring_sqe *sqe_copy; + + if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) + return 0; + + sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); + if (!sqe_copy) + return -EAGAIN; + + spin_lock_irq(&ctx->completion_lock); + if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) { + spin_unlock_irq(&ctx->completion_lock); + kfree(sqe_copy); + return 0; + } + + memcpy(sqe_copy, sqe, sizeof(*sqe_copy)); + req->submit.sqe = sqe_copy; + + INIT_WORK(&req->work, io_sq_wq_submit_work); + list_add_tail(&req->list, &ctx->defer_list); + spin_unlock_irq(&ctx->completion_lock); + return -EIOCBQUEUED; +} + static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct sqe_submit *s, bool force_nonblock, - struct io_submit_state *state) + const struct sqe_submit *s, bool force_nonblock) { int ret, opcode; @@ -1355,18 +1577,18 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_READV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_read(req, s, force_nonblock, state); + ret = io_read(req, s, force_nonblock); break; case IORING_OP_WRITEV: if (unlikely(s->sqe->buf_index)) return -EINVAL; - ret = io_write(req, s, force_nonblock, state); + ret = io_write(req, s, force_nonblock); break; case IORING_OP_READ_FIXED: - ret = io_read(req, s, force_nonblock, state); + ret = io_read(req, s, force_nonblock); break; case IORING_OP_WRITE_FIXED: - ret = io_write(req, s, force_nonblock, state); + ret = io_write(req, s, force_nonblock); break; case IORING_OP_FSYNC: ret = io_fsync(req, s->sqe, force_nonblock); @@ -1377,6 +1599,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_POLL_REMOVE: ret = io_poll_remove(req, s->sqe); break; + case IORING_OP_SYNC_FILE_RANGE: + ret = io_sync_file_range(req, s->sqe, force_nonblock); + break; default: ret = -EINVAL; break; @@ -1439,8 +1664,7 @@ restart: struct sqe_submit *s = &req->submit; const struct io_uring_sqe *sqe = s->sqe; - /* Ensure we clear previously set forced non-block flag */ - req->flags &= ~REQ_F_FORCE_NONBLOCK; + /* Ensure we clear previously set non-block flag */ req->rw.ki_flags &= ~IOCB_NOWAIT; ret = 0; @@ -1459,7 +1683,7 @@ restart: s->has_user = cur_mm != NULL; s->needs_lock = true; do { - ret = __io_submit_sqe(ctx, req, s, false, NULL); + ret = __io_submit_sqe(ctx, req, s, false); /* * We can get EAGAIN for polled IO even though * we're forcing a sync submission from here, @@ -1470,10 +1694,11 @@ restart: break; cond_resched(); } while (1); - - /* drop submission reference */ - io_put_req(req); } + + /* drop submission reference */ + io_put_req(req); + if (ret) { io_cqring_add_event(ctx, sqe->user_data, ret, 0); io_put_req(req); @@ -1585,6 +1810,11 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, flags = READ_ONCE(s->sqe->flags); fd = READ_ONCE(s->sqe->fd); + if (flags & IOSQE_IO_DRAIN) { + req->flags |= REQ_F_IO_DRAIN; + req->sequence = ctx->cached_sq_head - 1; + } + if (!io_op_needs_file(s->sqe)) { req->file = NULL; return 0; @@ -1614,7 +1844,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, int ret; /* enforce forwards compatibility on users */ - if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE)) + if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN))) return -EINVAL; req = io_get_req(ctx, state); @@ -1625,8 +1855,15 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, if (unlikely(ret)) goto out; - ret = __io_submit_sqe(ctx, req, s, true, state); - if (ret == -EAGAIN) { + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret == -EIOCBQUEUED) + ret = 0; + return ret; + } + + ret = __io_submit_sqe(ctx, req, s, true); + if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); @@ -1671,7 +1908,7 @@ out: static void io_submit_state_end(struct io_submit_state *state) { blk_finish_plug(&state->plug); - io_file_put(state, NULL); + io_file_put(state); if (state->free_reqs) kmem_cache_free_bulk(req_cachep, state->free_reqs, &state->reqs[state->cur_req]); @@ -1700,24 +1937,10 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * write new data to them. */ smp_store_release(&ring->r.head, ctx->cached_sq_head); - - /* - * write side barrier of head update, app has read side. See - * comment at the top of this file - */ - smp_wmb(); } } /* - * Undo last io_get_sqring() - */ -static void io_drop_sqring(struct io_ring_ctx *ctx) -{ - ctx->cached_sq_head--; -} - -/* * Fetch an sqe, if one is available. Note that s->sqe will point to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always @@ -1739,9 +1962,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) * though the application is the one updating it. */ head = ctx->cached_sq_head; - /* See comment at the top of this file */ - smp_rmb(); - if (head == READ_ONCE(ring->r.tail)) + /* make sure SQ entry isn't read before tail */ + if (head == smp_load_acquire(&ring->r.tail)) return false; head = READ_ONCE(ring->array[head & ctx->sq_mask]); @@ -1755,8 +1977,6 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) /* drop invalid entries */ ctx->cached_sq_head++; ring->dropped++; - /* See comment at the top of this file */ - smp_wmb(); return false; } @@ -1866,7 +2086,8 @@ static int io_sq_thread(void *data) /* Tell userspace we may need a wakeup call */ ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; - smp_wmb(); + /* make sure to read SQ tail after writing flags */ + smp_mb(); if (!io_get_sqring(ctx, &sqes[0])) { if (kthread_should_stop()) { @@ -1879,13 +2100,11 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; - smp_wmb(); continue; } finish_wait(&ctx->sqo_wait, &wait); ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; - smp_wmb(); } i = 0; @@ -1920,13 +2139,17 @@ static int io_sq_thread(void *data) unuse_mm(cur_mm); mmput(cur_mm); } + + if (kthread_should_park()) + kthread_parkme(); + return 0; } static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) { struct io_submit_state state, *statep = NULL; - int i, ret = 0, submit = 0; + int i, submit = 0; if (to_submit > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, ctx, to_submit); @@ -1935,6 +2158,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) for (i = 0; i < to_submit; i++) { struct sqe_submit s; + int ret; if (!io_get_sqring(ctx, &s)) break; @@ -1942,21 +2166,18 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; + submit++; ret = io_submit_sqe(ctx, &s, statep); - if (ret) { - io_drop_sqring(ctx); - break; - } - - submit++; + if (ret) + io_cqring_add_event(ctx, s.sqe->user_data, ret, 0); } io_commit_sqring(ctx); if (statep) io_submit_state_end(statep); - return submit ? submit : ret; + return submit; } static unsigned io_cqring_events(struct io_cq_ring *ring) @@ -2054,6 +2275,7 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx) if (ctx->sqo_thread) { ctx->sqo_stop = 1; mb(); + kthread_park(ctx->sqo_thread); kthread_stop(ctx->sqo_thread); ctx->sqo_thread = NULL; } @@ -2141,7 +2363,6 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) left = ctx->nr_user_files; while (left) { unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); - int ret; ret = __io_sqe_files_scm(ctx, this_files, total); if (ret) @@ -2215,6 +2436,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(ctx->user_files[i]); kfree(ctx->user_files); + ctx->user_files = NULL; ctx->nr_user_files = 0; return ret; } @@ -2235,19 +2457,23 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, mmgrab(current->mm); ctx->sqo_mm = current->mm; - ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); - if (!ctx->sq_thread_idle) - ctx->sq_thread_idle = HZ; + if (ctx->flags & IORING_SETUP_SQPOLL) { + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto err; - ret = -EINVAL; - if (!cpu_possible(p->sq_thread_cpu)) - goto err; + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); + if (!ctx->sq_thread_idle) + ctx->sq_thread_idle = HZ; - if (ctx->flags & IORING_SETUP_SQPOLL) { if (p->flags & IORING_SETUP_SQ_AFF) { - int cpu; + int cpu = array_index_nospec(p->sq_thread_cpu, + nr_cpu_ids); + + ret = -EINVAL; + if (!cpu_online(cpu)) + goto err; - cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS); ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, ctx, cpu, "io_uring-sq"); @@ -2308,8 +2534,12 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages) static void io_mem_free(void *ptr) { - struct page *page = virt_to_head_page(ptr); + struct page *page; + + if (!ptr) + return; + page = virt_to_head_page(ptr); if (put_page_testzero(page)) free_compound_page(page); } @@ -2350,7 +2580,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); - kfree(imu->bvec); + kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -2442,9 +2672,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, if (!pages || nr_pages > got_pages) { kfree(vmas); kfree(pages); - pages = kmalloc_array(nr_pages, sizeof(struct page *), + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - vmas = kmalloc_array(nr_pages, + vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), GFP_KERNEL); if (!pages || !vmas) { @@ -2456,7 +2686,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, got_pages = nr_pages; } - imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec), + imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { @@ -2495,6 +2725,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, } if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); + kvfree(imu->bvec); goto err; } @@ -2517,16 +2748,48 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ctx->nr_user_bufs++; } - kfree(pages); - kfree(vmas); + kvfree(pages); + kvfree(vmas); return 0; err: - kfree(pages); - kfree(vmas); + kvfree(pages); + kvfree(vmas); io_sqe_buffer_unregister(ctx); return ret; } +static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) +{ + __s32 __user *fds = arg; + int fd; + + if (ctx->cq_ev_fd) + return -EBUSY; + + if (copy_from_user(&fd, fds, sizeof(*fds))) + return -EFAULT; + + ctx->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ctx->cq_ev_fd)) { + int ret = PTR_ERR(ctx->cq_ev_fd); + ctx->cq_ev_fd = NULL; + return ret; + } + + return 0; +} + +static int io_eventfd_unregister(struct io_ring_ctx *ctx) +{ + if (ctx->cq_ev_fd) { + eventfd_ctx_put(ctx->cq_ev_fd); + ctx->cq_ev_fd = NULL; + return 0; + } + + return -ENXIO; +} + static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); @@ -2536,6 +2799,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_iopoll_reap_events(ctx); io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); + io_eventfd_unregister(ctx); #if defined(CONFIG_UNIX) if (ctx->ring_sock) @@ -2560,9 +2824,13 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) __poll_t mask = 0; poll_wait(file, &ctx->cq_wait, wait); - /* See comment at the top of this file */ + /* + * synchronizes with barrier from wq_has_sleeper call in + * io_commit_cqring + */ smp_rmb(); - if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head) + if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head != + ctx->sq_ring->ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) mask |= EPOLLIN | EPOLLRDNORM; @@ -2673,24 +2941,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, mutex_lock(&ctx->uring_lock); submitted = io_ring_submit(ctx, to_submit); mutex_unlock(&ctx->uring_lock); - - if (submitted < 0) - goto out_ctx; } if (flags & IORING_ENTER_GETEVENTS) { unsigned nr_events = 0; min_complete = min(min_complete, ctx->cq_entries); - /* - * The application could have included the 'to_submit' count - * in how many events it wanted to wait for. If we failed to - * submit the desired count, we may need to adjust the number - * of events to poll/wait for. - */ - if (submitted < to_submit) - min_complete = min_t(unsigned, submitted, min_complete); - if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); ret = io_iopoll_check(ctx, &nr_events, min_complete); @@ -2736,17 +2992,12 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { - io_mem_free(ctx->sq_ring); + if (!ctx->sq_sqes) return -ENOMEM; - } cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); - if (!cq_ring) { - io_mem_free(ctx->sq_ring); - io_mem_free(ctx->sq_sqes); + if (!cq_ring) return -ENOMEM; - } ctx->cq_ring = cq_ring; cq_ring->ring_mask = p->cq_entries - 1; @@ -2917,11 +3168,31 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) + __releases(ctx->uring_lock) + __acquires(ctx->uring_lock) { int ret; + /* + * We're inside the ring mutex, if the ref is already dying, then + * someone else killed the ctx or is already going through + * io_uring_register(). + */ + if (percpu_ref_is_dying(&ctx->refs)) + return -ENXIO; + percpu_ref_kill(&ctx->refs); + + /* + * Drop uring mutex before waiting for references to exit. If another + * thread is currently inside io_uring_enter() it might need to grab + * the uring_lock to make progress. If we hold it here across the drain + * wait, then we can deadlock. It's safe to drop the mutex here, since + * no new references will come in after we've killed the percpu ref. + */ + mutex_unlock(&ctx->uring_lock); wait_for_completion(&ctx->ctx_done); + mutex_lock(&ctx->uring_lock); switch (opcode) { case IORING_REGISTER_BUFFERS: @@ -2942,6 +3213,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_sqe_files_unregister(ctx); break; + case IORING_REGISTER_EVENTFD: + ret = -EINVAL; + if (nr_args != 1) + break; + ret = io_eventfd_register(ctx, arg); + break; + case IORING_UNREGISTER_EVENTFD: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_eventfd_unregister(ctx); + break; default: ret = -EINVAL; break; diff --git a/fs/iomap.c b/fs/iomap.c index abdd18e404f8..23ef63fd1669 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. * Copyright (c) 2016-2018 Christoph Hellwig. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <linux/module.h> #include <linux/compiler.h> @@ -249,6 +241,26 @@ iomap_read_page_end_io(struct bio_vec *bvec, int error) } static void +iomap_read_end_io(struct bio *bio) +{ + int error = blk_status_to_errno(bio->bi_status); + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) + iomap_read_page_end_io(bvec, error); + bio_put(bio); +} + +struct iomap_readpage_ctx { + struct page *cur_page; + bool cur_page_in_bio; + bool is_readahead; + struct bio *bio; + struct list_head *pages; +}; + +static void iomap_read_inline_data(struct inode *inode, struct page *page, struct iomap *iomap) { @@ -268,27 +280,6 @@ iomap_read_inline_data(struct inode *inode, struct page *page, SetPageUptodate(page); } -static void -iomap_read_end_io(struct bio *bio) -{ - int error = blk_status_to_errno(bio->bi_status); - struct bio_vec *bvec; - int i; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, i, iter_all) - iomap_read_page_end_io(bvec, error); - bio_put(bio); -} - -struct iomap_readpage_ctx { - struct page *cur_page; - bool cur_page_in_bio; - bool is_readahead; - struct bio *bio; - struct list_head *pages; -}; - static loff_t iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) @@ -665,6 +656,7 @@ static int iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, struct page **pagep, struct iomap *iomap) { + const struct iomap_page_ops *page_ops = iomap->page_ops; pgoff_t index = pos >> PAGE_SHIFT; struct page *page; int status = 0; @@ -674,9 +666,17 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, if (fatal_signal_pending(current)) return -EINTR; + if (page_ops && page_ops->page_prepare) { + status = page_ops->page_prepare(inode, pos, len, iomap); + if (status) + return status; + } + page = grab_cache_page_write_begin(inode->i_mapping, index, flags); - if (!page) - return -ENOMEM; + if (!page) { + status = -ENOMEM; + goto out_no_page; + } if (iomap->type == IOMAP_INLINE) iomap_read_inline_data(inode, page, iomap); @@ -684,15 +684,21 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, status = __block_write_begin_int(page, pos, len, NULL, iomap); else status = __iomap_write_begin(inode, pos, len, page, iomap); - if (unlikely(status)) { - unlock_page(page); - put_page(page); - page = NULL; - iomap_write_failed(inode, pos, len); - } + if (unlikely(status)) + goto out_unlock; *pagep = page; + return 0; + +out_unlock: + unlock_page(page); + put_page(page); + iomap_write_failed(inode, pos, len); + +out_no_page: + if (page_ops && page_ops->page_done) + page_ops->page_done(inode, pos, 0, NULL, iomap); return status; } @@ -738,13 +744,11 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len, * uptodate page as a zero-length write, and force the caller to redo * the whole thing. */ - if (unlikely(copied < len && !PageUptodate(page))) { - copied = 0; - } else { - iomap_set_range_uptodate(page, offset_in_page(pos), len); - iomap_set_page_dirty(page); - } - return __generic_write_end(inode, pos, copied, page); + if (unlikely(copied < len && !PageUptodate(page))) + return 0; + iomap_set_range_uptodate(page, offset_in_page(pos), len); + iomap_set_page_dirty(page); + return copied; } static int @@ -761,7 +765,6 @@ iomap_write_end_inline(struct inode *inode, struct page *page, kunmap_atomic(addr); mark_inode_dirty(inode); - __generic_write_end(inode, pos, copied, page); return copied; } @@ -769,19 +772,22 @@ static int iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page, struct iomap *iomap) { + const struct iomap_page_ops *page_ops = iomap->page_ops; int ret; if (iomap->type == IOMAP_INLINE) { ret = iomap_write_end_inline(inode, page, iomap, pos, copied); } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) { - ret = generic_write_end(NULL, inode->i_mapping, pos, len, - copied, page, NULL); + ret = block_write_end(NULL, inode->i_mapping, pos, len, copied, + page, NULL); } else { ret = __iomap_write_end(inode, pos, len, copied, page, iomap); } - if (iomap->page_done) - iomap->page_done(inode, pos, copied, page, iomap); + __generic_write_end(inode, pos, ret, page); + if (page_ops && page_ops->page_done) + page_ops->page_done(inode, pos, copied, page, iomap); + put_page(page); if (ret < len) iomap_write_failed(inode, pos, len); @@ -1592,9 +1598,8 @@ static void iomap_dio_bio_end_io(struct bio *bio) if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { struct bvec_iter_all iter_all; struct bio_vec *bvec; - int i; - bio_for_each_segment_all(bvec, bio, i, iter_all) + bio_for_each_segment_all(bvec, bio, iter_all) put_page(bvec->bv_page); } bio_put(bio); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 488a9e7f8f66..603b052a3c94 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -72,17 +72,11 @@ static struct inode *isofs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void isofs_i_callback(struct rcu_head *head) +static void isofs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } -static void isofs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, isofs_i_callback); -} - static void init_once(void *foo) { struct iso_inode_info *ei = foo; @@ -122,7 +116,7 @@ static int isofs_remount(struct super_block *sb, int *flags, char *data) static const struct super_operations isofs_sops = { .alloc_inode = isofs_alloc_inode, - .destroy_inode = isofs_destroy_inode, + .free_inode = isofs_free_inode, .put_super = isofs_put_super, .statfs = isofs_statfs, .remount_fs = isofs_remount, diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index eab04eca95a3..112d85849db1 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -340,6 +340,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) rdev = old_decode_dev(je16_to_cpu(jdev.old_id)); else rdev = new_decode_dev(je32_to_cpu(jdev.new_id)); + /* fall through */ case S_IFSOCK: case S_IFIFO: diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 389ea53ea487..bccfc40b3a74 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1414,11 +1414,6 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f) jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL); - if (f->target) { - kfree(f->target); - f->target = NULL; - } - fds = f->dents; while(fds) { fd = fds; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index bb6ae387469f..af4aa6599473 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -44,15 +44,12 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb) return &f->vfs_inode; } -static void jffs2_i_callback(struct rcu_head *head) +static void jffs2_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); -} + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); -static void jffs2_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, jffs2_i_callback); + kfree(f->target); + kmem_cache_free(jffs2_inode_cachep, f); } static void jffs2_i_init_once(void *foo) @@ -255,7 +252,7 @@ static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data) static const struct super_operations jffs2_super_operations = { .alloc_inode = jffs2_alloc_inode, - .destroy_inode =jffs2_destroy_inode, + .free_inode = jffs2_free_inode, .put_super = jffs2_put_super, .statfs = jffs2_statfs, .remount_fs = jffs2_remount_fs, diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 8c06a6ea862d..ebb299003a5b 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -117,7 +117,8 @@ int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) rc = posix_acl_update_mode(inode, &mode, &acl); if (rc) goto end_tx; - update_mode = 1; + if (mode != inode->i_mode) + update_mode = 1; } rc = __jfs_set_acl(tid, inode, type, acl); if (!rc) { diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 805ae9e8944a..f2b92b292abe 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -31,6 +31,7 @@ #include "jfs_extent.h" #include "jfs_unicode.h" #include "jfs_debug.h" +#include "jfs_dmap.h" struct inode *jfs_iget(struct super_block *sb, unsigned long ino) @@ -150,6 +151,8 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) void jfs_evict_inode(struct inode *inode) { + struct jfs_inode_info *ji = JFS_IP(inode); + jfs_info("In jfs_evict_inode, inode = 0x%p", inode); if (!inode->i_nlink && !is_bad_inode(inode)) { @@ -173,6 +176,16 @@ void jfs_evict_inode(struct inode *inode) } clear_inode(inode); dquot_drop(inode); + + BUG_ON(!list_empty(&ji->anon_inode_list)); + + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag != -1) { + struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; + atomic_dec(&bmap->db_active[ji->active_ag]); + ji->active_ag = -1; + } + spin_unlock_irq(&ji->ag_lock); } void jfs_dirty_inode(struct inode *inode, int flags) diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 912a3af2393e..340eb8e4f716 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -23,6 +23,8 @@ #include <linux/rwsem.h> #include <linux/slab.h> #include <linux/bitops.h> +#include <linux/uuid.h> + #include "jfs_types.h" #include "jfs_xtree.h" #include "jfs_dtree.h" @@ -178,8 +180,8 @@ struct jfs_sb_info { pxd_t logpxd; /* pxd describing log */ pxd_t fsckpxd; /* pxd describing fsck wkspc */ pxd_t ait2; /* pxd describing AIT copy */ - char uuid[16]; /* 128-bit uuid for volume */ - char loguuid[16]; /* 128-bit uuid for log */ + uuid_t uuid; /* 128-bit uuid for volume */ + uuid_t loguuid; /* 128-bit uuid for log */ /* * commit_state is used for synchronization of the jfs_commit * threads. It is protected by LAZY_LOCK(). diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 6b68df395892..4c77b808020b 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1092,8 +1092,7 @@ int lmLogOpen(struct super_block *sb) mutex_lock(&jfs_log_mutex); list_for_each_entry(log, &jfs_external_logs, journal_list) { if (log->bdev->bd_dev == sbi->logdev) { - if (memcmp(log->uuid, sbi->loguuid, - sizeof(log->uuid))) { + if (!uuid_equal(&log->uuid, &sbi->loguuid)) { jfs_warn("wrong uuid on JFS journal"); mutex_unlock(&jfs_log_mutex); return -EINVAL; @@ -1130,7 +1129,7 @@ int lmLogOpen(struct super_block *sb) } log->bdev = bdev; - memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); + uuid_copy(&log->uuid, &sbi->loguuid); /* * initialize log: @@ -1336,7 +1335,7 @@ int lmLogInit(struct jfs_log * log) jfs_info("lmLogInit: inline log:0x%p base:0x%Lx size:0x%x", log, (unsigned long long)log->base, log->size); } else { - if (memcmp(logsuper->uuid, log->uuid, 16)) { + if (!uuid_equal(&logsuper->uuid, &log->uuid)) { jfs_warn("wrong uuid on JFS log device"); goto errout20; } @@ -1732,7 +1731,7 @@ static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, int i; struct logsuper *logsuper; struct lbuf *bpsuper; - char *uuid = sbi->uuid; + uuid_t *uuid = &sbi->uuid; /* * insert/remove file system device to log active file system list. @@ -1743,8 +1742,8 @@ static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, logsuper = (struct logsuper *) bpsuper->l_ldata; if (activate) { for (i = 0; i < MAX_ACTIVE; i++) - if (!memcmp(logsuper->active[i].uuid, NULL_UUID, 16)) { - memcpy(logsuper->active[i].uuid, uuid, 16); + if (uuid_is_null(&logsuper->active[i].uuid)) { + uuid_copy(&logsuper->active[i].uuid, uuid); sbi->aggregate = i; break; } @@ -1755,8 +1754,9 @@ static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, } } else { for (i = 0; i < MAX_ACTIVE; i++) - if (!memcmp(logsuper->active[i].uuid, uuid, 16)) { - memcpy(logsuper->active[i].uuid, NULL_UUID, 16); + if (uuid_equal(&logsuper->active[i].uuid, uuid)) { + uuid_copy(&logsuper->active[i].uuid, + &uuid_null); break; } if (i == MAX_ACTIVE) { diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h index e38c21598850..870fc22360e7 100644 --- a/fs/jfs/jfs_logmgr.h +++ b/fs/jfs/jfs_logmgr.h @@ -19,6 +19,8 @@ #ifndef _H_JFS_LOGMGR #define _H_JFS_LOGMGR +#include <linux/uuid.h> + #include "jfs_filsys.h" #include "jfs_lock.h" @@ -73,15 +75,13 @@ struct logsuper { __le32 state; /* 4: state - see below */ __le32 end; /* 4: addr of last log record set by logredo */ - char uuid[16]; /* 16: 128-bit journal uuid */ + uuid_t uuid; /* 16: 128-bit journal uuid */ char label[16]; /* 16: journal label */ struct { - char uuid[16]; + uuid_t uuid; } active[MAX_ACTIVE]; /* 2048: active file systems list */ }; -#define NULL_UUID "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" - /* log flag: commit option (see jfs_filsys.h) */ /* log state */ @@ -410,7 +410,7 @@ struct jfs_log { spinlock_t synclock; /* 4: synclist lock */ struct lbuf *wqueue; /* 4: log pageout queue */ int count; /* 4: count */ - char uuid[16]; /* 16: 128-bit uuid of log device */ + uuid_t uuid; /* 16: 128-bit uuid of log device */ int no_integrity; /* 3: flag to disable journaling to disk */ }; diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index d8658607bf46..c9c1f16b93df 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -389,8 +389,8 @@ static int chkSuper(struct super_block *sb) sbi->logpxd = j_sb->s_logpxd; else { sbi->logdev = new_decode_dev(le32_to_cpu(j_sb->s_logdev)); - memcpy(sbi->uuid, j_sb->s_uuid, sizeof(sbi->uuid)); - memcpy(sbi->loguuid, j_sb->s_loguuid, sizeof(sbi->uuid)); + uuid_copy(&sbi->uuid, &j_sb->s_uuid); + uuid_copy(&sbi->loguuid, &j_sb->s_loguuid); } sbi->fsckpxd = j_sb->s_fsckpxd; sbi->ait2 = j_sb->s_ait2; diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h index 04847b8d3070..eb03de7fa925 100644 --- a/fs/jfs/jfs_superblock.h +++ b/fs/jfs/jfs_superblock.h @@ -18,6 +18,8 @@ #ifndef _H_JFS_SUPERBLOCK #define _H_JFS_SUPERBLOCK +#include <linux/uuid.h> + /* * make the magic number something a human could read */ @@ -98,11 +100,9 @@ struct jfs_superblock { __le64 s_xsize; /* 8: extendfs s_size */ pxd_t s_xfsckpxd; /* 8: extendfs fsckpxd */ pxd_t s_xlogpxd; /* 8: extendfs logpxd */ - /* - 128 byte boundary - */ - - char s_uuid[16]; /* 16: 128-bit uuid for volume */ + uuid_t s_uuid; /* 16: 128-bit uuid for volume */ char s_label[16]; /* 16: volume label */ - char s_loguuid[16]; /* 16: 128-bit uuid for log device */ + uuid_t s_loguuid; /* 16: 128-bit uuid for log device */ }; diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index a5663cb621d8..78789c5ed36b 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1928,8 +1928,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, * header ? */ if (tlck->type & tlckTRUNCATE) { - /* This odd declaration suppresses a bogus gcc warning */ - pxd_t pxd = pxd; /* truncated extent of xad */ + pxd_t pxd; /* truncated extent of xad */ int twm; /* diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 14528c0ffe63..fa719a1553b6 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -203,7 +203,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, * RETURN: Errors from subroutines * * note: - * EACCESS: user needs search+write permission on the parent directory + * EACCES: user needs search+write permission on the parent directory */ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) { diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 65d8fc87ab11..8f78fa374242 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -124,27 +124,9 @@ static struct inode *jfs_alloc_inode(struct super_block *sb) return &jfs_inode->vfs_inode; } -static void jfs_i_callback(struct rcu_head *head) +static void jfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); - struct jfs_inode_info *ji = JFS_IP(inode); - kmem_cache_free(jfs_inode_cachep, ji); -} - -static void jfs_destroy_inode(struct inode *inode) -{ - struct jfs_inode_info *ji = JFS_IP(inode); - - BUG_ON(!list_empty(&ji->anon_inode_list)); - - spin_lock_irq(&ji->ag_lock); - if (ji->active_ag != -1) { - struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; - atomic_dec(&bmap->db_active[ji->active_ag]); - ji->active_ag = -1; - } - spin_unlock_irq(&ji->ag_lock); - call_rcu(&inode->i_rcu, jfs_i_callback); + kmem_cache_free(jfs_inode_cachep, JFS_IP(inode)); } static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -174,9 +156,11 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = maxinodes; buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) - atomic_read(&imap->im_numfree)); - buf->f_fsid.val[0] = (u32)crc32_le(0, sbi->uuid, sizeof(sbi->uuid)/2); - buf->f_fsid.val[1] = (u32)crc32_le(0, sbi->uuid + sizeof(sbi->uuid)/2, - sizeof(sbi->uuid)/2); + buf->f_fsid.val[0] = crc32_le(0, (char *)&sbi->uuid, + sizeof(sbi->uuid)/2); + buf->f_fsid.val[1] = crc32_le(0, + (char *)&sbi->uuid + sizeof(sbi->uuid)/2, + sizeof(sbi->uuid)/2); buf->f_namelen = JFS_NAME_MAX; return 0; @@ -912,7 +896,7 @@ out: static const struct super_operations jfs_super_operations = { .alloc_inode = jfs_alloc_inode, - .destroy_inode = jfs_destroy_inode, + .free_inode = jfs_free_inode, .dirty_inode = jfs_dirty_inode, .write_inode = jfs_write_inode, .evict_inode = jfs_evict_inode, diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index b84d635567d3..016ba88f7335 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -532,9 +532,6 @@ void kernfs_put(struct kernfs_node *kn) kfree_const(kn->name); if (kn->iattr) { - if (kn->iattr->ia_secdata) - security_release_secctx(kn->iattr->ia_secdata, - kn->iattr->ia_secdata_len); simple_xattrs_free(&kn->iattr->xattrs); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } @@ -618,6 +615,7 @@ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) } static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, + struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) @@ -650,11 +648,10 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, kn->id.generation = gen; /* - * set ino first. This barrier is paired with atomic_inc_not_zero in + * set ino first. This RELEASE is paired with atomic_inc_not_zero in * kernfs_find_and_get_node_by_ino */ - smp_mb__before_atomic(); - atomic_set(&kn->count, 1); + atomic_set_release(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); @@ -674,6 +671,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, goto err_out3; } + if (parent) { + ret = security_kernfs_init_security(parent, kn); + if (ret) + goto err_out3; + } + return kn; err_out3: @@ -692,7 +695,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, { struct kernfs_node *kn; - kn = __kernfs_new_node(kernfs_root(parent), + kn = __kernfs_new_node(kernfs_root(parent), parent, name, mode, uid, gid, flags); if (kn) { kernfs_get(parent); @@ -795,9 +798,8 @@ int kernfs_add_one(struct kernfs_node *kn) /* Update timestamps on the parent */ ps_iattr = parent->iattr; if (ps_iattr) { - struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ktime_get_real_ts64(&ps_iattrs->ia_ctime); - ps_iattrs->ia_mtime = ps_iattrs->ia_ctime; + ktime_get_real_ts64(&ps_iattr->ia_ctime); + ps_iattr->ia_mtime = ps_iattr->ia_ctime; } mutex_unlock(&kernfs_mutex); @@ -962,7 +964,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, INIT_LIST_HEAD(&root->supers); root->next_generation = 1; - kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, + kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) { @@ -1329,9 +1331,8 @@ static void __kernfs_remove(struct kernfs_node *kn) /* update timestamps on the parent */ if (ps_iattr) { - ktime_get_real_ts64(&ps_iattr->ia_iattr.ia_ctime); - ps_iattr->ia_iattr.ia_mtime = - ps_iattr->ia_iattr.ia_ctime; + ktime_get_real_ts64(&ps_iattr->ia_ctime); + ps_iattr->ia_mtime = ps_iattr->ia_ctime; } kernfs_put(pos); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 0c1fd945ce42..f89a0f13840e 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -31,30 +31,27 @@ static const struct inode_operations kernfs_iops = { .listxattr = kernfs_iop_listxattr, }; -static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) +static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc) { static DEFINE_MUTEX(iattr_mutex); struct kernfs_iattrs *ret; - struct iattr *iattrs; mutex_lock(&iattr_mutex); - if (kn->iattr) + if (kn->iattr || !alloc) goto out_unlock; kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL); if (!kn->iattr) goto out_unlock; - iattrs = &kn->iattr->ia_iattr; /* assign default attributes */ - iattrs->ia_mode = kn->mode; - iattrs->ia_uid = GLOBAL_ROOT_UID; - iattrs->ia_gid = GLOBAL_ROOT_GID; + kn->iattr->ia_uid = GLOBAL_ROOT_UID; + kn->iattr->ia_gid = GLOBAL_ROOT_GID; - ktime_get_real_ts64(&iattrs->ia_atime); - iattrs->ia_mtime = iattrs->ia_atime; - iattrs->ia_ctime = iattrs->ia_atime; + ktime_get_real_ts64(&kn->iattr->ia_atime); + kn->iattr->ia_mtime = kn->iattr->ia_atime; + kn->iattr->ia_ctime = kn->iattr->ia_atime; simple_xattrs_init(&kn->iattr->xattrs); out_unlock: @@ -63,32 +60,37 @@ out_unlock: return ret; } +static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) +{ + return __kernfs_iattrs(kn, 1); +} + +static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn) +{ + return __kernfs_iattrs(kn, 0); +} + int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { struct kernfs_iattrs *attrs; - struct iattr *iattrs; unsigned int ia_valid = iattr->ia_valid; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; - iattrs = &attrs->ia_iattr; - if (ia_valid & ATTR_UID) - iattrs->ia_uid = iattr->ia_uid; + attrs->ia_uid = iattr->ia_uid; if (ia_valid & ATTR_GID) - iattrs->ia_gid = iattr->ia_gid; + attrs->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) - iattrs->ia_atime = iattr->ia_atime; + attrs->ia_atime = iattr->ia_atime; if (ia_valid & ATTR_MTIME) - iattrs->ia_mtime = iattr->ia_mtime; + attrs->ia_mtime = iattr->ia_mtime; if (ia_valid & ATTR_CTIME) - iattrs->ia_ctime = iattr->ia_ctime; - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - iattrs->ia_mode = kn->mode = mode; - } + attrs->ia_ctime = iattr->ia_ctime; + if (ia_valid & ATTR_MODE) + kn->mode = iattr->ia_mode; return 0; } @@ -135,23 +137,6 @@ out: return error; } -static int kernfs_node_setsecdata(struct kernfs_iattrs *attrs, void **secdata, - u32 *secdata_len) -{ - void *old_secdata; - size_t old_secdata_len; - - old_secdata = attrs->ia_secdata; - old_secdata_len = attrs->ia_secdata_len; - - attrs->ia_secdata = *secdata; - attrs->ia_secdata_len = *secdata_len; - - *secdata = old_secdata; - *secdata_len = old_secdata_len; - return 0; -} - ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) { struct kernfs_node *kn = kernfs_dentry_node(dentry); @@ -171,14 +156,15 @@ static inline void set_default_inode_attr(struct inode *inode, umode_t mode) inode->i_ctime = current_time(inode); } -static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) +static inline void set_inode_attr(struct inode *inode, + struct kernfs_iattrs *attrs) { struct super_block *sb = inode->i_sb; - inode->i_uid = iattr->ia_uid; - inode->i_gid = iattr->ia_gid; - inode->i_atime = timespec64_trunc(iattr->ia_atime, sb->s_time_gran); - inode->i_mtime = timespec64_trunc(iattr->ia_mtime, sb->s_time_gran); - inode->i_ctime = timespec64_trunc(iattr->ia_ctime, sb->s_time_gran); + inode->i_uid = attrs->ia_uid; + inode->i_gid = attrs->ia_gid; + inode->i_atime = timespec64_trunc(attrs->ia_atime, sb->s_time_gran); + inode->i_mtime = timespec64_trunc(attrs->ia_mtime, sb->s_time_gran); + inode->i_ctime = timespec64_trunc(attrs->ia_ctime, sb->s_time_gran); } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) @@ -186,15 +172,12 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) struct kernfs_iattrs *attrs = kn->iattr; inode->i_mode = kn->mode; - if (attrs) { + if (attrs) /* * kernfs_node has non-default attributes get them from * persistent copy in kernfs_node. */ - set_inode_attr(inode, &attrs->ia_iattr); - security_inode_notifysecctx(inode, attrs->ia_secdata, - attrs->ia_secdata_len); - } + set_inode_attr(inode, attrs); if (kernfs_type(kn) == KERNFS_DIR) set_nlink(inode, kn->dir.subdirs + 2); @@ -305,78 +288,57 @@ int kernfs_iop_permission(struct inode *inode, int mask) return generic_permission(inode, mask); } -static int kernfs_xattr_get(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *suffix, void *value, size_t size) +int kernfs_xattr_get(struct kernfs_node *kn, const char *name, + void *value, size_t size) { - const char *name = xattr_full_name(handler, suffix); - struct kernfs_node *kn = inode->i_private; - struct kernfs_iattrs *attrs; - - attrs = kernfs_iattrs(kn); + struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn); if (!attrs) - return -ENOMEM; + return -ENODATA; return simple_xattr_get(&attrs->xattrs, name, value, size); } -static int kernfs_xattr_set(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *suffix, const void *value, - size_t size, int flags) +int kernfs_xattr_set(struct kernfs_node *kn, const char *name, + const void *value, size_t size, int flags) { - const char *name = xattr_full_name(handler, suffix); - struct kernfs_node *kn = inode->i_private; - struct kernfs_iattrs *attrs; - - attrs = kernfs_iattrs(kn); + struct kernfs_iattrs *attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; return simple_xattr_set(&attrs->xattrs, name, value, size, flags); } -static const struct xattr_handler kernfs_trusted_xattr_handler = { - .prefix = XATTR_TRUSTED_PREFIX, - .get = kernfs_xattr_get, - .set = kernfs_xattr_set, -}; - -static int kernfs_security_xattr_set(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *suffix, const void *value, - size_t size, int flags) +static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *suffix, void *value, size_t size) { + const char *name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; - struct kernfs_iattrs *attrs; - void *secdata; - u32 secdata_len = 0; - int error; - - attrs = kernfs_iattrs(kn); - if (!attrs) - return -ENOMEM; - error = security_inode_setsecurity(inode, suffix, value, size, flags); - if (error) - return error; - error = security_inode_getsecctx(inode, &secdata, &secdata_len); - if (error) - return error; + return kernfs_xattr_get(kn, name, value, size); +} - mutex_lock(&kernfs_mutex); - error = kernfs_node_setsecdata(attrs, &secdata, &secdata_len); - mutex_unlock(&kernfs_mutex); +static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *suffix, const void *value, + size_t size, int flags) +{ + const char *name = xattr_full_name(handler, suffix); + struct kernfs_node *kn = inode->i_private; - if (secdata) - security_release_secctx(secdata, secdata_len); - return error; + return kernfs_xattr_set(kn, name, value, size, flags); } +static const struct xattr_handler kernfs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = kernfs_vfs_xattr_get, + .set = kernfs_vfs_xattr_set, +}; + static const struct xattr_handler kernfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = kernfs_xattr_get, - .set = kernfs_security_xattr_set, + .get = kernfs_vfs_xattr_get, + .set = kernfs_vfs_xattr_set, }; const struct xattr_handler *kernfs_xattr_handlers[] = { diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 0b7d197a904c..3c437990f39a 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -20,9 +20,11 @@ #include <linux/fs_context.h> struct kernfs_iattrs { - struct iattr ia_iattr; - void *ia_secdata; - u32 ia_secdata_len; + kuid_t ia_uid; + kgid_t ia_gid; + struct timespec64 ia_atime; + struct timespec64 ia_mtime; + struct timespec64 ia_ctime; struct simple_xattrs xattrs; }; diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 162f43b80c84..eb46c3a16e2f 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -33,8 +33,8 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, kgid_t gid = GLOBAL_ROOT_GID; if (target->iattr) { - uid = target->iattr->ia_iattr.ia_uid; - gid = target->iattr->ia_iattr.ia_gid; + uid = target->iattr->ia_uid; + gid = target->iattr->ia_gid; } kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, uid, gid, diff --git a/fs/libfs.c b/fs/libfs.c index 0fb590d79f30..9efb647917e0 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -146,9 +146,11 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) switch (whence) { case 1: offset += file->f_pos; + /* fall through */ case 0: if (offset >= 0) break; + /* fall through */ default: return -EINVAL; } diff --git a/fs/locks.c b/fs/locks.c index 71d0c6c2aac5..d7c05dde4ed8 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1476,7 +1476,7 @@ static void lease_clear_pending(struct file_lock *fl, int arg) switch (arg) { case F_UNLCK: fl->fl_flags &= ~FL_UNLOCK_PENDING; - /* fall through: */ + /* fall through */ case F_RDLCK: fl->fl_flags &= ~FL_DOWNGRADE_PENDING; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 72e308c3e66b..101200761f61 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -68,17 +68,11 @@ static struct inode *minix_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void minix_i_callback(struct rcu_head *head) +static void minix_free_in_core_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(minix_inode_cachep, minix_i(inode)); } -static void minix_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, minix_i_callback); -} - static void init_once(void *foo) { struct minix_inode_info *ei = (struct minix_inode_info *) foo; @@ -110,7 +104,7 @@ static void destroy_inodecache(void) static const struct super_operations minix_sops = { .alloc_inode = minix_alloc_inode, - .destroy_inode = minix_destroy_inode, + .free_inode = minix_free_in_core_inode, .write_inode = minix_write_inode, .evict_inode = minix_evict_inode, .put_super = minix_put_super, diff --git a/fs/mpage.c b/fs/mpage.c index 3f19da75178b..436a85260394 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -47,10 +47,9 @@ static void mpage_end_io(struct bio *bio) { struct bio_vec *bv; - int i; struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, bio, i, iter_all) { + bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; page_endio(page, bio_op(bio), blk_status_to_errno(bio->bi_status)); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 5f93cfacb3d1..69d02cf8cf37 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -121,7 +121,6 @@ config PNFS_FILE_LAYOUT config PNFS_BLOCK tristate depends on NFS_V4_1 && BLK_DEV_DM - depends on 64BIT || LBDAF default NFS_V4 config PNFS_FLEXFILE_LAYOUT diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 414a90d48493..f61af8307dc8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2055,17 +2055,11 @@ struct inode *nfs_alloc_inode(struct super_block *sb) } EXPORT_SYMBOL_GPL(nfs_alloc_inode); -static void nfs_i_callback(struct rcu_head *head) +void nfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } - -void nfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, nfs_i_callback); -} -EXPORT_SYMBOL_GPL(nfs_destroy_inode); +EXPORT_SYMBOL_GPL(nfs_free_inode); static inline void nfs4_init_once(struct nfs_inode *nfsi) { diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c7cf23ae6597..331a0504eaf8 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -381,7 +381,7 @@ int nfs_check_flags(int); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); -extern void nfs_destroy_inode(struct inode *); +extern void nfs_free_inode(struct inode *); extern int nfs_write_inode(struct inode *, struct writeback_control *); extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index ff6f85fb676b..5196bfa7894d 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -329,9 +329,6 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, }; ssize_t err, err2; - if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) - return -EOPNOTSUPP; - src_lock = nfs_get_lock_context(nfs_file_open_context(src)); if (IS_ERR(src_lock)) return PTR_ERR(src_lock); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 45b2322e092d..00d17198ee12 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,8 +133,10 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { + if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY)) + return -EOPNOTSUPP; if (file_inode(file_in) == file_inode(file_out)) - return -EINVAL; + return -EOPNOTSUPP; return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 6fb7cb6b3f4b..689977e148cb 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -50,7 +50,7 @@ struct file_system_type nfs4_referral_fs_type = { static const struct super_operations nfs4_sops = { .alloc_inode = nfs_alloc_inode, - .destroy_inode = nfs_destroy_inode, + .free_inode = nfs_free_inode, .write_inode = nfs4_write_inode, .drop_inode = nfs_drop_inode, .statfs = nfs_statfs, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index cfcabc33e24d..602446158bfb 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2589,7 +2589,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, ARRAY_SIZE(nfs4_acl_bitmap), &hdr); rpc_prepare_reply_pages(req, args->acl_pages, 0, - args->acl_len, replen); + args->acl_len, replen + 1); encode_nops(&hdr); } @@ -2811,7 +2811,7 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, } rpc_prepare_reply_pages(req, (struct page **)&args->page, 0, - PAGE_SIZE, replen); + PAGE_SIZE, replen + 1); encode_nops(&hdr); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 23790c7b2289..450ae77d19bf 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -309,7 +309,7 @@ struct file_system_type nfs_xdev_fs_type = { const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, - .destroy_inode = nfs_destroy_inode, + .free_inode = nfs_free_inode, .write_inode = nfs_write_inode, .drop_inode = nfs_drop_inode, .statfs = nfs_statfs, @@ -2041,7 +2041,8 @@ static int nfs23_validate_mount_data(void *options, memcpy(sap, &data->addr, sizeof(data->addr)); args->nfs_server.addrlen = sizeof(data->addr); args->nfs_server.port = ntohs(data->addr.sin_port); - if (!nfs_verify_server_address(sap)) + if (sap->sa_family != AF_INET || + !nfs_verify_server_address(sap)) goto out_no_address; if (!(data->flags & NFS_MOUNT_TCP)) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 8f933e84cec1..9bc32af4e2da 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -442,7 +442,9 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; __be32 nfserr; - int count; + int count = 0; + struct page **p; + caddr_t page_addr = NULL; dprintk("nfsd: READDIR(3) %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -462,7 +464,18 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie, &resp->common, nfs3svc_encode_entry); memcpy(resp->verf, argp->verf, 8); - resp->count = resp->buffer - argp->buffer; + count = 0; + for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { + page_addr = page_address(*p); + + if (((caddr_t)resp->buffer >= page_addr) && + ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { + count += (caddr_t)resp->buffer - page_addr; + break; + } + count += PAGE_SIZE; + } + resp->count = count >> 2; if (resp->offset) { loff_t offset = argp->cookie; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 93fea246f676..8d789124ed3c 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -573,6 +573,7 @@ int nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readdirargs *args = rqstp->rq_argp; + int len; u32 max_blocksize = svc_max_payload(rqstp); p = decode_fh(p, &args->fh); @@ -582,8 +583,14 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) args->verf = p; p += 2; args->dircount = ~0; args->count = ntohl(*p++); - args->count = min_t(u32, args->count, max_blocksize); - args->buffer = page_address(*(rqstp->rq_next_page++)); + len = args->count = min_t(u32, args->count, max_blocksize); + + while (len > 0) { + struct page *p = *(rqstp->rq_next_page++); + if (!args->buffer) + args->buffer = page_address(p); + len -= PAGE_SIZE; + } return xdr_argsize_check(rqstp, p); } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index d219159b98af..7caa3801ce72 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1010,8 +1010,9 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) cb->cb_seq_status = 1; cb->cb_status = 0; if (minorversion) { - if (!nfsd41_cb_get_slot(clp, task)) + if (!cb->cb_holds_slot && !nfsd41_cb_get_slot(clp, task)) return; + cb->cb_holds_slot = true; } rpc_call_start(task); } @@ -1038,6 +1039,9 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback return true; } + if (!cb->cb_holds_slot) + goto need_restart; + switch (cb->cb_seq_status) { case 0: /* @@ -1076,6 +1080,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback cb->cb_seq_status); } + cb->cb_holds_slot = false; clear_bit(0, &clp->cl_cb_slot_busy); rpc_wake_up_next(&clp->cl_cb_waitq); dprintk("%s: freed slot, new seqid=%d\n", __func__, @@ -1283,6 +1288,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_seq_status = 1; cb->cb_status = 0; cb->cb_need_restart = false; + cb->cb_holds_slot = false; } void nfsd4_run_cb(struct nfsd4_callback *cb) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 0cfd257ffdaf..4680ad3bf55b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -427,6 +427,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; reclaim = true; + /* fall through */ case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, cstate, open); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 5188f9f70c78..8c8563441208 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -126,7 +126,6 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) SHASH_DESC_ON_STACK(desc, tfm); desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; status = crypto_shash_digest(desc, clname->data, clname->len, cksum.data); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6a45fb00c5fc..eca4a23f93c8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -265,6 +265,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, static void free_blocked_lock(struct nfsd4_blocked_lock *nbl) { + locks_delete_block(&nbl->nbl_lock); locks_release_private(&nbl->nbl_lock); kfree(nbl); } @@ -293,11 +294,18 @@ remove_blocked_locks(struct nfs4_lockowner *lo) nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); - locks_delete_block(&nbl->nbl_lock); free_blocked_lock(nbl); } } +static void +nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb) +{ + struct nfsd4_blocked_lock *nbl = container_of(cb, + struct nfsd4_blocked_lock, nbl_cb); + locks_delete_block(&nbl->nbl_lock); +} + static int nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) { @@ -325,6 +333,7 @@ nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb) } static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { + .prepare = nfsd4_cb_notify_lock_prepare, .done = nfsd4_cb_notify_lock_done, .release = nfsd4_cb_notify_lock_release, }; @@ -2576,6 +2585,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; default: /* checked by xdr code */ WARN_ON_ONCE(1); + /* fall through */ case SP4_SSV: status = nfserr_encr_alg_unsupp; goto out_nolock; @@ -4863,7 +4873,6 @@ nfs4_laundromat(struct nfsd_net *nn) nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); - locks_delete_block(&nbl->nbl_lock); free_blocked_lock(nbl); } out: diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 396c76755b03..9d6cb246c6c5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -70,6 +70,7 @@ struct nfsd4_callback { int cb_seq_status; int cb_status; bool cb_need_restart; + bool cb_holds_slot; }; struct nfsd4_callback_ops { diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index a2f247b6a209..42395ba52da6 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -252,7 +252,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, void nilfs_inode_add_blocks(struct inode *inode, int n); void nilfs_inode_sub_blocks(struct inode *inode, int n); extern struct inode *nilfs_new_inode(struct inode *, umode_t); -extern void nilfs_free_inode(struct inode *); extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern void nilfs_set_inode_flags(struct inode *); extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); @@ -289,7 +288,6 @@ static inline int nilfs_mark_inode_dirty_sync(struct inode *inode) /* super.c */ extern struct inode *nilfs_alloc_inode(struct super_block *); -extern void nilfs_destroy_inode(struct inode *); extern __printf(3, 4) void __nilfs_msg(struct super_block *sb, const char *level, diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 26290aa1023f..5729ee86da9a 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -155,21 +155,14 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) return &ii->vfs_inode; } -static void nilfs_i_callback(struct rcu_head *head) +static void nilfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); - if (nilfs_is_metadata_file_inode(inode)) nilfs_mdt_destroy(inode); kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); } -void nilfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, nilfs_i_callback); -} - static int nilfs_sync_super(struct super_block *sb, int flag) { struct the_nilfs *nilfs = sb->s_fs_info; @@ -686,7 +679,7 @@ static int nilfs_show_options(struct seq_file *seq, struct dentry *dentry) static const struct super_operations nilfs_sops = { .alloc_inode = nilfs_alloc_inode, - .destroy_inode = nilfs_destroy_inode, + .free_inode = nilfs_free_inode, .dirty_inode = nilfs_dirty_inode, .evict_inode = nilfs_evict_inode, .put_super = nilfs_put_super, diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index 735bfb2e9190..521dc91d2cb5 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -1,7 +1,6 @@ config FANOTIFY bool "Filesystem wide access notification" select FSNOTIFY - select ANON_INODES select EXPORTFS default n ---help--- diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index a34d7e003d7d..e6fde1a5c072 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -346,10 +346,16 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) __kernel_fsid_t fsid = {}; fsnotify_foreach_obj_type(type) { + struct fsnotify_mark_connector *conn; + if (!fsnotify_iter_should_report_type(iter_info, type)) continue; - fsid = iter_info->marks[type]->connector->fsid; + conn = READ_ONCE(iter_info->marks[type]->connector); + /* Mark is just getting destroyed or created? */ + if (!conn) + continue; + fsid = conn->fsid; if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) continue; return fsid; @@ -408,8 +414,12 @@ static int fanotify_handle_event(struct fsnotify_group *group, return 0; } - if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { fsid = fanotify_get_fsid(iter_info); + /* Racing with mark destruction or creation? */ + if (!fsid.val[0] && !fsid.val[1]) + return 0; + } event = fanotify_alloc_event(group, inode, mask, data, data_type, &fsid); diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index b981fc0c8379..0161c74e76e2 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -1,6 +1,5 @@ config INOTIFY_USER bool "Inotify support for userspace" - select ANON_INODES select FSNOTIFY default y ---help--- diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d593d4269561..22acb0a79b53 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -239,13 +239,13 @@ static void fsnotify_drop_object(unsigned int type, void *objp) void fsnotify_put_mark(struct fsnotify_mark *mark) { - struct fsnotify_mark_connector *conn; + struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector); void *objp = NULL; unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ - if (!mark->connector) { + if (!conn) { if (refcount_dec_and_test(&mark->refcnt)) fsnotify_final_mark_destroy(mark); return; @@ -255,10 +255,9 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) * We have to be careful so that traversals of obj_list under lock can * safely grab mark reference. */ - if (!refcount_dec_and_lock(&mark->refcnt, &mark->connector->lock)) + if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock)) return; - conn = mark->connector; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { objp = fsnotify_detach_connector_from_object(conn, &type); @@ -266,7 +265,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) } else { __fsnotify_recalc_mask(conn); } - mark->connector = NULL; + WRITE_ONCE(mark->connector, NULL); spin_unlock(&conn->lock); fsnotify_drop_object(type, objp); @@ -620,7 +619,7 @@ restart: /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: - mark->connector = conn; + WRITE_ONCE(mark->connector, conn); out_err: spin_unlock(&conn->lock); spin_unlock(&mark->lock); @@ -808,6 +807,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, refcount_set(&mark->refcnt, 1); fsnotify_get_group(group); mark->group = group; + WRITE_ONCE(mark->connector, NULL); } /* diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index bd3221cbdd95..fb1a2b49a5da 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -332,23 +332,11 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb) return NULL; } -static void ntfs_i_callback(struct rcu_head *head) +void ntfs_free_big_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); } -void ntfs_destroy_big_inode(struct inode *inode) -{ - ntfs_inode *ni = NTFS_I(inode); - - ntfs_debug("Entering."); - BUG_ON(ni->page); - if (!atomic_dec_and_test(&ni->count)) - BUG(); - call_rcu(&inode->i_rcu, ntfs_i_callback); -} - static inline ntfs_inode *ntfs_alloc_extent_inode(void) { ntfs_inode *ni; @@ -2287,6 +2275,9 @@ void ntfs_evict_big_inode(struct inode *vi) ni->ext.base_ntfs_ino = NULL; } } + BUG_ON(ni->page); + if (!atomic_dec_and_test(&ni->count)) + BUG(); return; } diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index b3c3469de6cb..58c8fd2948d3 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -278,7 +278,7 @@ extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, u32 name_len); extern struct inode *ntfs_alloc_big_inode(struct super_block *sb); -extern void ntfs_destroy_big_inode(struct inode *inode); +extern void ntfs_free_big_inode(struct inode *inode); extern void ntfs_evict_big_inode(struct inode *vi); extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index bb7159f697f2..887ea8b3b000 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2676,7 +2676,7 @@ static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) */ static const struct super_operations ntfs_sops = { .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */ - .destroy_inode = ntfs_destroy_big_inode, /* VFS: Deallocate inode. */ + .free_inode = ntfs_free_big_inode, /* VFS: Deallocate inode. */ #ifdef NTFS_RW .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to disk. */ diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index af2e7473956e..67dcee65fe50 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -81,6 +81,7 @@ static void o2quo_fence_self(void) default: WARN_ON(o2nm_single_cluster->cl_fence_method >= O2NM_FENCE_METHODS); + /* fall through */ case O2NM_FENCE_RESET: printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this " "system by restarting ***\n"); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 8decbe95dcec..98885181e1fe 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -349,17 +349,11 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb) return &ip->ip_vfs_inode; } -static void dlmfs_i_callback(struct rcu_head *head) +static void dlmfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); } -static void dlmfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, dlmfs_i_callback); -} - static void dlmfs_evict_inode(struct inode *inode) { int status; @@ -605,7 +599,7 @@ static const struct inode_operations dlmfs_root_inode_operations = { static const struct super_operations dlmfs_ops = { .statfs = simple_statfs, .alloc_inode = dlmfs_alloc_inode, - .destroy_inode = dlmfs_destroy_inode, + .free_inode = dlmfs_free_inode, .evict_inode = dlmfs_evict_inode, .drop_inode = generic_delete_inode, }; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 96ae7cedd487..8821bc7b9c72 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -134,7 +134,7 @@ static int ocfs2_get_sector(struct super_block *sb, int block, int sect_size); static struct inode *ocfs2_alloc_inode(struct super_block *sb); -static void ocfs2_destroy_inode(struct inode *inode); +static void ocfs2_free_inode(struct inode *inode); static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); static int ocfs2_enable_quotas(struct ocfs2_super *osb); static void ocfs2_disable_quotas(struct ocfs2_super *osb); @@ -147,7 +147,7 @@ static struct dquot **ocfs2_get_dquots(struct inode *inode) static const struct super_operations ocfs2_sops = { .statfs = ocfs2_statfs, .alloc_inode = ocfs2_alloc_inode, - .destroy_inode = ocfs2_destroy_inode, + .free_inode = ocfs2_free_inode, .drop_inode = ocfs2_drop_inode, .evict_inode = ocfs2_evict_inode, .sync_fs = ocfs2_sync_fs, @@ -575,17 +575,11 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) return &oi->vfs_inode; } -static void ocfs2_i_callback(struct rcu_head *head) +static void ocfs2_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); } -static void ocfs2_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, ocfs2_i_callback); -} - static unsigned long long ocfs2_max_file_offset(unsigned int bbits, unsigned int cbits) { @@ -600,7 +594,6 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits, */ #if BITS_PER_LONG == 32 -# if defined(CONFIG_LBDAF) BUILD_BUG_ON(sizeof(sector_t) != 8); /* * We might be limited by page cache size. @@ -614,15 +607,6 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits, */ bitshift = 31; } -# else - /* - * We are limited by the size of sector_t. Use block size, as - * that's what we expose to the VFS. - */ - bytes = 1 << bbits; - trim = 1; - bitshift = 31; -# endif #endif /* diff --git a/fs/open.c b/fs/open.c index f1c2f855fd43..9c7d724a6f67 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1215,3 +1215,22 @@ int nonseekable_open(struct inode *inode, struct file *filp) } EXPORT_SYMBOL(nonseekable_open); + +/* + * stream_open is used by subsystems that want stream-like file descriptors. + * Such file descriptors are not seekable and don't have notion of position + * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL). + * Contrary to file descriptors of other regular files, .read() and .write() + * can run simultaneously. + * + * stream_open never fails and is marked to return int so that it could be + * directly used as file_operations.open . + */ +int stream_open(struct inode *inode, struct file *filp) +{ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); + filp->f_mode |= FMODE_STREAM; + return 0; +} + +EXPORT_SYMBOL(stream_open); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 1b2d0d2fe2ee..46655e454c55 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -336,17 +336,11 @@ static struct inode *openprom_alloc_inode(struct super_block *sb) return &oi->vfs_inode; } -static void openprom_i_callback(struct rcu_head *head) +static void openprom_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(op_inode_cachep, OP_I(inode)); } -static void openprom_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, openprom_i_callback); -} - static struct inode *openprom_iget(struct super_block *sb, ino_t ino) { struct inode *inode; @@ -375,7 +369,7 @@ static int openprom_remount(struct super_block *sb, int *flags, char *data) static const struct super_operations openprom_sops = { .alloc_inode = openprom_alloc_inode, - .destroy_inode = openprom_destroy_inode, + .free_inode = openprom_free_inode, .statfs = simple_statfs, .remount_fs = openprom_remount, }; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index dfaee90d30bd..3784f7e8b603 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -124,11 +124,9 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb) return &orangefs_inode->vfs_inode; } -static void orangefs_i_callback(struct rcu_head *head) +static void orangefs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); - kmem_cache_free(orangefs_inode_cache, orangefs_inode); + kmem_cache_free(orangefs_inode_cache, ORANGEFS_I(inode)); } static void orangefs_destroy_inode(struct inode *inode) @@ -138,8 +136,6 @@ static void orangefs_destroy_inode(struct inode *inode) gossip_debug(GOSSIP_SUPER_DEBUG, "%s: deallocated %p destroying inode %pU\n", __func__, orangefs_inode, get_khandle_from_ino(inode)); - - call_rcu(&inode->i_rcu, orangefs_i_callback); } /* @@ -299,6 +295,7 @@ void fsid_key_table_finalize(void) static const struct super_operations orangefs_s_ops = { .alloc_inode = orangefs_alloc_inode, + .free_inode = orangefs_free_inode, .destroy_inode = orangefs_destroy_inode, .drop_inode = generic_delete_inode, .statfs = orangefs_statfs, diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 0116735cc321..5ec4fc2f5d7e 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -190,11 +190,13 @@ static struct inode *ovl_alloc_inode(struct super_block *sb) return &oi->vfs_inode; } -static void ovl_i_callback(struct rcu_head *head) +static void ovl_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); + struct ovl_inode *oi = OVL_I(inode); - kmem_cache_free(ovl_inode_cachep, OVL_I(inode)); + kfree(oi->redirect); + mutex_destroy(&oi->lock); + kmem_cache_free(ovl_inode_cachep, oi); } static void ovl_destroy_inode(struct inode *inode) @@ -207,10 +209,6 @@ static void ovl_destroy_inode(struct inode *inode) ovl_dir_cache_free(inode); else iput(oi->lowerdata); - kfree(oi->redirect); - mutex_destroy(&oi->lock); - - call_rcu(&inode->i_rcu, ovl_i_callback); } static void ovl_free_fs(struct ovl_fs *ofs) @@ -377,6 +375,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data) static const struct super_operations ovl_super_operations = { .alloc_inode = ovl_alloc_inode, + .free_inode = ovl_free_inode, .destroy_inode = ovl_destroy_inode, .drop_inode = generic_delete_inode, .put_super = ovl_put_super, diff --git a/fs/pipe.c b/fs/pipe.c index 070aad543382..41065901106b 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal); * in the tee() system call, when we duplicate the buffers in one * pipe into another. */ -void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - get_page(buf->page); + return try_get_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_get); diff --git a/fs/proc/base.c b/fs/proc/base.c index ddef482f1334..b6ccb6c57706 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -407,7 +407,6 @@ static void unlock_trace(struct task_struct *task) static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - struct stack_trace trace; unsigned long *entries; int err; @@ -430,20 +429,17 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, if (!entries) return -ENOMEM; - trace.nr_entries = 0; - trace.max_entries = MAX_STACK_TRACE_DEPTH; - trace.entries = entries; - trace.skip = 0; - err = lock_trace(task); if (!err) { - unsigned int i; + unsigned int i, nr_entries; - save_stack_trace_tsk(task, &trace); + nr_entries = stack_trace_save_tsk(task, entries, + MAX_STACK_TRACE_DEPTH, 0); - for (i = 0; i < trace.nr_entries; i++) { + for (i = 0; i < nr_entries; i++) { seq_printf(m, "[<0>] %pB\n", (void *)entries[i]); } + unlock_trace(task); } kfree(entries); @@ -489,10 +485,9 @@ static int lstats_show_proc(struct seq_file *m, void *v) lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (bt == ULONG_MAX) - break; seq_printf(m, " %ps", (void *)bt); } seq_putc(m, '\n'); @@ -616,24 +611,25 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - long nr; - unsigned long args[6], sp, pc; + struct syscall_info info; + u64 *args = &info.data.args[0]; int res; res = lock_trace(task); if (res) return res; - if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) + if (task_current_syscall(task, &info)) seq_puts(m, "running\n"); - else if (nr < 0) - seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + else if (info.data.nr < 0) + seq_printf(m, "%d 0x%llx 0x%llx\n", + info.data.nr, info.sp, info.data.instruction_pointer); else seq_printf(m, - "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", - nr, + "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n", + info.data.nr, args[0], args[1], args[2], args[3], args[4], args[5], - sp, pc); + info.sp, info.data.instruction_pointer); unlock_trace(task); return 0; @@ -2539,6 +2535,11 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, rcu_read_unlock(); return -EACCES; } + /* Prevent changes to overridden credentials. */ + if (current_cred() != current_real_cred()) { + rcu_read_unlock(); + return -EBUSY; + } rcu_read_unlock(); if (count > PAGE_SIZE) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index fc7e38def174..5f8d215b3fd0 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -72,17 +72,11 @@ static struct inode *proc_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void proc_i_callback(struct rcu_head *head) +static void proc_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } -static void proc_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, proc_i_callback); -} - static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; @@ -123,7 +117,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, - .destroy_inode = proc_destroy_inode, + .free_inode = proc_free_inode, .drop_inode = generic_delete_inode, .evict_inode = proc_evict_inode, .statfs = simple_statfs, diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d65390727541..7325baa8f9d4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1626,9 +1626,11 @@ static void drop_sysctl_table(struct ctl_table_header *header) if (--header->nreg) return; - if (parent) + if (parent) { put_links(header); - start_unregistering(header); + start_unregistering(header); + } + if (!--header->count) kfree_rcu(header, rcu); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 92a91e7816d8..95ca1fe7283c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1143,6 +1143,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = -EINTR; goto out_mm; } + /* + * Avoid to modify vma->vm_flags + * without locked ops while the + * coredump reads the vm_flags. + */ + if (!mmget_still_valid(mm)) { + /* + * Silently return "count" + * like if get_task_mm() + * failed. FIXME: should this + * function have returned + * -ESRCH if get_task_mm() + * failed like if + * get_proc_task() fails? + */ + up_write(&mm->mmap_sem); + goto out_mm; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index c60ee46f3e39..29e94e0b6d73 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -115,7 +115,7 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v) rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off); - seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n", + seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %ps <- %pS\n", pstore_ftrace_decode_cpu(rec), pstore_ftrace_read_timestamp(rec), rec->ip, rec->parent_ip, (void *)rec->ip, diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 3d46fe302fcb..48c70aa4a3ec 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -28,14 +28,14 @@ static const struct super_operations qnx4_sops; static struct inode *qnx4_alloc_inode(struct super_block *sb); -static void qnx4_destroy_inode(struct inode *inode); +static void qnx4_free_inode(struct inode *inode); static int qnx4_remount(struct super_block *sb, int *flags, char *data); static int qnx4_statfs(struct dentry *, struct kstatfs *); static const struct super_operations qnx4_sops = { .alloc_inode = qnx4_alloc_inode, - .destroy_inode = qnx4_destroy_inode, + .free_inode = qnx4_free_inode, .statfs = qnx4_statfs, .remount_fs = qnx4_remount, }; @@ -342,17 +342,11 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void qnx4_i_callback(struct rcu_head *head) +static void qnx4_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); } -static void qnx4_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, qnx4_i_callback); -} - static void init_once(void *foo) { struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 4aeb26bcb4d0..59cf45f6be49 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -29,14 +29,14 @@ static const struct super_operations qnx6_sops; static void qnx6_put_super(struct super_block *sb); static struct inode *qnx6_alloc_inode(struct super_block *sb); -static void qnx6_destroy_inode(struct inode *inode); +static void qnx6_free_inode(struct inode *inode); static int qnx6_remount(struct super_block *sb, int *flags, char *data); static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf); static int qnx6_show_options(struct seq_file *seq, struct dentry *root); static const struct super_operations qnx6_sops = { .alloc_inode = qnx6_alloc_inode, - .destroy_inode = qnx6_destroy_inode, + .free_inode = qnx6_free_inode, .put_super = qnx6_put_super, .statfs = qnx6_statfs, .remount_fs = qnx6_remount, @@ -602,17 +602,11 @@ static struct inode *qnx6_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void qnx6_i_callback(struct rcu_head *head) +static void qnx6_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(qnx6_inode_cachep, QNX6_I(inode)); } -static void qnx6_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, qnx6_i_callback); -} - static void init_once(void *foo) { struct qnx6_inode_info *ei = (struct qnx6_inode_info *) foo; diff --git a/fs/read_write.c b/fs/read_write.c index 177ccc3d405a..c543d965e288 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -365,29 +365,37 @@ out_putf: int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { struct inode *inode; - loff_t pos; int retval = -EINVAL; inode = file_inode(file); if (unlikely((ssize_t) count < 0)) return retval; - pos = *ppos; - if (unlikely(pos < 0)) { - if (!unsigned_offsets(file)) - return retval; - if (count >= -pos) /* both values are in 0..LLONG_MAX */ - return -EOVERFLOW; - } else if (unlikely((loff_t) (pos + count) < 0)) { - if (!unsigned_offsets(file)) - return retval; - } - if (unlikely(inode->i_flctx && mandatory_lock(inode))) { - retval = locks_mandatory_area(inode, file, pos, pos + count - 1, - read_write == READ ? F_RDLCK : F_WRLCK); - if (retval < 0) - return retval; + /* + * ranged mandatory locking does not apply to streams - it makes sense + * only for files where position has a meaning. + */ + if (ppos) { + loff_t pos = *ppos; + + if (unlikely(pos < 0)) { + if (!unsigned_offsets(file)) + return retval; + if (count >= -pos) /* both values are in 0..LLONG_MAX */ + return -EOVERFLOW; + } else if (unlikely((loff_t) (pos + count) < 0)) { + if (!unsigned_offsets(file)) + return retval; + } + + if (unlikely(inode->i_flctx && mandatory_lock(inode))) { + retval = locks_mandatory_area(inode, file, pos, pos + count - 1, + read_write == READ ? F_RDLCK : F_WRLCK); + if (retval < 0) + return retval; + } } + return security_file_permission(file, read_write == READ ? MAY_READ : MAY_WRITE); } @@ -400,12 +408,13 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo ssize_t ret; init_sync_kiocb(&kiocb, filp); - kiocb.ki_pos = *ppos; + kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_init(&iter, READ, &iov, 1, len); ret = call_read_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); - *ppos = kiocb.ki_pos; + if (ppos) + *ppos = kiocb.ki_pos; return ret; } @@ -468,12 +477,12 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t ssize_t ret; init_sync_kiocb(&kiocb, filp); - kiocb.ki_pos = *ppos; + kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_init(&iter, WRITE, &iov, 1, len); ret = call_write_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); - if (ret > 0) + if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } @@ -558,14 +567,10 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ return ret; } -static inline loff_t file_pos_read(struct file *file) -{ - return file->f_pos; -} - -static inline void file_pos_write(struct file *file, loff_t pos) +/* file_ppos returns &file->f_pos or NULL if file is stream */ +static inline loff_t *file_ppos(struct file *file) { - file->f_pos = pos; + return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) @@ -574,10 +579,14 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) ssize_t ret = -EBADF; if (f.file) { - loff_t pos = file_pos_read(f.file); - ret = vfs_read(f.file, buf, count, &pos); - if (ret >= 0) - file_pos_write(f.file, pos); + loff_t pos, *ppos = file_ppos(f.file); + if (ppos) { + pos = *ppos; + ppos = &pos; + } + ret = vfs_read(f.file, buf, count, ppos); + if (ret >= 0 && ppos) + f.file->f_pos = pos; fdput_pos(f); } return ret; @@ -594,10 +603,14 @@ ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) ssize_t ret = -EBADF; if (f.file) { - loff_t pos = file_pos_read(f.file); - ret = vfs_write(f.file, buf, count, &pos); - if (ret >= 0) - file_pos_write(f.file, pos); + loff_t pos, *ppos = file_ppos(f.file); + if (ppos) { + pos = *ppos; + ppos = &pos; + } + ret = vfs_write(f.file, buf, count, ppos); + if (ret >= 0 && ppos) + f.file->f_pos = pos; fdput_pos(f); } @@ -672,14 +685,15 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, ret = kiocb_set_rw_flags(&kiocb, flags); if (ret) return ret; - kiocb.ki_pos = *ppos; + kiocb.ki_pos = (ppos ? *ppos : 0); if (type == READ) ret = call_read_iter(filp, &kiocb, iter); else ret = call_write_iter(filp, &kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); - *ppos = kiocb.ki_pos; + if (ppos) + *ppos = kiocb.ki_pos; return ret; } @@ -1012,10 +1026,14 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, ssize_t ret = -EBADF; if (f.file) { - loff_t pos = file_pos_read(f.file); - ret = vfs_readv(f.file, vec, vlen, &pos, flags); - if (ret >= 0) - file_pos_write(f.file, pos); + loff_t pos, *ppos = file_ppos(f.file); + if (ppos) { + pos = *ppos; + ppos = &pos; + } + ret = vfs_readv(f.file, vec, vlen, ppos, flags); + if (ret >= 0 && ppos) + f.file->f_pos = pos; fdput_pos(f); } @@ -1032,10 +1050,14 @@ static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, ssize_t ret = -EBADF; if (f.file) { - loff_t pos = file_pos_read(f.file); - ret = vfs_writev(f.file, vec, vlen, &pos, flags); - if (ret >= 0) - file_pos_write(f.file, pos); + loff_t pos, *ppos = file_ppos(f.file); + if (ppos) { + pos = *ppos; + ppos = &pos; + } + ret = vfs_writev(f.file, vec, vlen, ppos, flags); + if (ret >= 0 && ppos) + f.file->f_pos = pos; fdput_pos(f); } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1fc934d24459..ab028ea0e561 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -650,17 +650,11 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void reiserfs_i_callback(struct rcu_head *head) +static void reiserfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); } -static void reiserfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, reiserfs_i_callback); -} - static void init_once(void *foo) { struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; @@ -815,7 +809,7 @@ static struct dquot **reiserfs_get_dquots(struct inode *inode) static const struct super_operations reiserfs_sops = { .alloc_inode = reiserfs_alloc_inode, - .destroy_inode = reiserfs_destroy_inode, + .free_inode = reiserfs_free_inode, .write_inode = reiserfs_write_inode, .dirty_inode = reiserfs_dirty_inode, .evict_inode = reiserfs_evict_inode, diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 6ccb51993a76..7d580f7c3f1d 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -381,18 +381,11 @@ static struct inode *romfs_alloc_inode(struct super_block *sb) /* * return a spent inode to the slab cache */ -static void romfs_i_callback(struct rcu_head *head) +static void romfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } -static void romfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, romfs_i_callback); -} - /* * get filesystem statistics */ @@ -439,7 +432,7 @@ static int romfs_remount(struct super_block *sb, int *flags, char *data) static const struct super_operations romfs_super_ops = { .alloc_inode = romfs_alloc_inode, - .destroy_inode = romfs_destroy_inode, + .free_inode = romfs_free_inode, .statfs = romfs_statfs, .remount_fs = romfs_remount, }; diff --git a/fs/seq_file.c b/fs/seq_file.c index 1dea7a8a5255..abe27ec43176 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -317,6 +317,7 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) switch (whence) { case SEEK_CUR: offset += file->f_pos; + /* fall through */ case SEEK_SET: if (offset < 0) break; diff --git a/fs/signalfd.c b/fs/signalfd.c index 757afc7c5895..44b6845b071c 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -176,6 +176,7 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info if (!nonblock) break; ret = -EAGAIN; + /* fall through */ default: spin_unlock_irq(¤t->sighand->siglock); return ret; diff --git a/fs/splice.c b/fs/splice.c index 3ee7e82df48f..25212dcca2df 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -330,8 +330,8 @@ const struct pipe_buf_operations default_pipe_buf_ops = { .get = generic_pipe_buf_get, }; -static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return 1; } @@ -1593,7 +1593,11 @@ retry: * Get a reference to this pipe buffer, * so we can copy the contents over. */ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } *obuf = *ibuf; /* @@ -1667,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, * Get a reference to this pipe buffer, * so we can copy the contents over. */ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } obuf = opipe->bufs + nbuf; *obuf = *ibuf; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 40e657386fa5..767046d9f65d 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -473,18 +473,11 @@ static struct inode *squashfs_alloc_inode(struct super_block *sb) } -static void squashfs_i_callback(struct rcu_head *head) +static void squashfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); } -static void squashfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, squashfs_i_callback); -} - - static struct file_system_type squashfs_fs_type = { .owner = THIS_MODULE, .name = "squashfs", @@ -496,7 +489,7 @@ MODULE_ALIAS_FS("squashfs"); static const struct super_operations squashfs_super_ops = { .alloc_inode = squashfs_alloc_inode, - .destroy_inode = squashfs_destroy_inode, + .free_inode = squashfs_free_inode, .statfs = squashfs_statfs, .put_super = squashfs_put_super, .remount_fs = squashfs_remount diff --git a/fs/stack.c b/fs/stack.c index a54e33ed10f1..664ed35558bd 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -21,11 +21,10 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) i_size = i_size_read(src); /* - * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to - * keep the two halves of i_blocks in sync despite SMP or PREEMPT - - * though stat's generic_fillattr() doesn't bother, and we won't be - * applying quotas (where i_blocks does become important) at the - * upper level. + * But on 32-bit, we ought to make an effort to keep the two halves of + * i_blocks in sync despite SMP or PREEMPT - though stat's + * generic_fillattr() doesn't bother, and we won't be applying quotas + * (where i_blocks does become important) at the upper level. * * We don't actually know what locking is used at the lower level; * but if it's a filesystem that supports quotas, it will be using @@ -44,9 +43,9 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) * include/linux/fs.h). We don't necessarily hold i_mutex when this * is called, so take i_lock for that case. * - * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the - * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock - * for that case too, and do both at once by combining the tests. + * And if on 32-bit, continue our effort to keep the two halves of + * i_blocks in sync despite SMP or PREEMPT: use i_lock for that case + * too, and do both at once by combining the tests. * * There is none of this locking overhead in the 64-bit case. */ diff --git a/fs/super.c b/fs/super.c index 583a0124bc39..2739f57515f8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1467,11 +1467,6 @@ int vfs_get_tree(struct fs_context *fc) struct super_block *sb; int error; - if (fc->fs_type->fs_flags & FS_REQUIRES_DEV && !fc->source) { - errorf(fc, "Filesystem requires source device"); - return -ENOENT; - } - if (fc->root) return -EBUSY; diff --git a/fs/sync.c b/fs/sync.c index b54e0541ad89..01e82170545a 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -234,58 +234,10 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) return do_fsync(fd, 1); } -/* - * sys_sync_file_range() permits finely controlled syncing over a segment of - * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is - * zero then sys_sync_file_range() will operate from offset out to EOF. - * - * The flag bits are: - * - * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range - * before performing the write. - * - * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the - * range which are not presently under writeback. Note that this may block for - * significant periods due to exhaustion of disk request structures. - * - * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range - * after performing the write. - * - * Useful combinations of the flag bits are: - * - * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages - * in the range which were dirty on entry to sys_sync_file_range() are placed - * under writeout. This is a start-write-for-data-integrity operation. - * - * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which - * are not presently under writeout. This is an asynchronous flush-to-disk - * operation. Not suitable for data integrity operations. - * - * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for - * completion of writeout of all pages in the range. This will be used after an - * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait - * for that operation to complete and to return the result. - * - * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: - * a traditional sync() operation. This is a write-for-data-integrity operation - * which will ensure that all pages in the range which were dirty on entry to - * sys_sync_file_range() are committed to disk. - * - * - * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any - * I/O errors or ENOSPC conditions and will return those to the caller, after - * clearing the EIO and ENOSPC flags in the address_space. - * - * It should be noted that none of these operations write out the file's - * metadata. So unless the application is strictly performing overwrites of - * already-instantiated disk blocks, there are no guarantees here that the data - * will be available after a crash. - */ -int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes, - unsigned int flags) +int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, + unsigned int flags) { int ret; - struct fd f; struct address_space *mapping; loff_t endbyte; /* inclusive */ umode_t i_mode; @@ -325,41 +277,96 @@ int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes, else endbyte--; /* inclusive */ - ret = -EBADF; - f = fdget(fd); - if (!f.file) - goto out; - - i_mode = file_inode(f.file)->i_mode; + i_mode = file_inode(file)->i_mode; ret = -ESPIPE; if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && !S_ISLNK(i_mode)) - goto out_put; + goto out; - mapping = f.file->f_mapping; + mapping = file->f_mapping; ret = 0; if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { - ret = file_fdatawait_range(f.file, offset, endbyte); + ret = file_fdatawait_range(file, offset, endbyte); if (ret < 0) - goto out_put; + goto out; } if (flags & SYNC_FILE_RANGE_WRITE) { ret = __filemap_fdatawrite_range(mapping, offset, endbyte, WB_SYNC_NONE); if (ret < 0) - goto out_put; + goto out; } if (flags & SYNC_FILE_RANGE_WAIT_AFTER) - ret = file_fdatawait_range(f.file, offset, endbyte); + ret = file_fdatawait_range(file, offset, endbyte); -out_put: - fdput(f); out: return ret; } +/* + * sys_sync_file_range() permits finely controlled syncing over a segment of + * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is + * zero then sys_sync_file_range() will operate from offset out to EOF. + * + * The flag bits are: + * + * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range + * before performing the write. + * + * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the + * range which are not presently under writeback. Note that this may block for + * significant periods due to exhaustion of disk request structures. + * + * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range + * after performing the write. + * + * Useful combinations of the flag bits are: + * + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages + * in the range which were dirty on entry to sys_sync_file_range() are placed + * under writeout. This is a start-write-for-data-integrity operation. + * + * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which + * are not presently under writeout. This is an asynchronous flush-to-disk + * operation. Not suitable for data integrity operations. + * + * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for + * completion of writeout of all pages in the range. This will be used after an + * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait + * for that operation to complete and to return the result. + * + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: + * a traditional sync() operation. This is a write-for-data-integrity operation + * which will ensure that all pages in the range which were dirty on entry to + * sys_sync_file_range() are committed to disk. + * + * + * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any + * I/O errors or ENOSPC conditions and will return those to the caller, after + * clearing the EIO and ENOSPC flags in the address_space. + * + * It should be noted that none of these operations write out the file's + * metadata. So unless the application is strictly performing overwrites of + * already-instantiated disk blocks, there are no guarantees here that the data + * will be available after a crash. + */ +int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes, + unsigned int flags) +{ + int ret; + struct fd f; + + ret = -EBADF; + f = fdget(fd); + if (f.file) + ret = sync_file_range(f.file, offset, nbytes, flags); + + fdput(f); + return ret; +} + SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, unsigned int, flags) { diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 273736f41be3..02b1d9d0c182 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -313,17 +313,11 @@ static struct inode *sysv_alloc_inode(struct super_block *sb) return &si->vfs_inode; } -static void sysv_i_callback(struct rcu_head *head) +static void sysv_free_in_core_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); } -static void sysv_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, sysv_i_callback); -} - static void init_once(void *p) { struct sysv_inode_info *si = (struct sysv_inode_info *)p; @@ -333,7 +327,7 @@ static void init_once(void *p) const struct super_operations sysv_sops = { .alloc_inode = sysv_alloc_inode, - .destroy_inode = sysv_destroy_inode, + .free_inode = sysv_free_in_core_inode, .write_inode = sysv_write_inode, .evict_inode = sysv_evict_inode, .put_super = sysv_put_super, diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index 5bf5fd08879e..b758004085c4 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -33,7 +33,6 @@ int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node, int err; shash->tfm = c->hash_tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_digest(shash, node, le32_to_cpu(ch->len), hash); if (err < 0) @@ -56,7 +55,6 @@ static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash, int err; shash->tfm = c->hmac_tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_digest(shash, hash, c->hash_len, hmac); if (err < 0) @@ -88,7 +86,6 @@ int ubifs_prepare_auth_node(struct ubifs_info *c, void *node, return -ENOMEM; hash_desc->tfm = c->hash_tfm; - hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; ubifs_shash_copy_state(c, inhash, hash_desc); err = crypto_shash_final(hash_desc, hash); @@ -123,7 +120,6 @@ static struct shash_desc *ubifs_get_desc(const struct ubifs_info *c, return ERR_PTR(-ENOMEM); desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_init(desc); if (err) { @@ -364,7 +360,6 @@ static int ubifs_node_calc_hmac(const struct ubifs_info *c, const void *node, ubifs_assert(c, ofs_hmac + hmac_len < len); shash->tfm = c->hmac_tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_init(shash); if (err) @@ -483,7 +478,6 @@ int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac) return 0; shash->tfm = c->hmac_tfm; - shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP; err = crypto_shash_init(shash); if (err) diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 0a0e65c07c6d..5c8a81a019a4 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -576,7 +576,6 @@ static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_h SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); hash_desc->tfm = c->hash_tfm; - hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; ubifs_shash_copy_state(c, log_hash, hash_desc); return crypto_shash_final(hash_desc, hash); @@ -587,7 +586,6 @@ static int authenticate_sleb_hmac(struct ubifs_info *c, u8 *hash, u8 *hmac) SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm); hmac_desc->tfm = c->hmac_tfm; - hmac_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; return crypto_shash_digest(hmac_desc, hash, c->hash_len, hmac); } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 8dc2818fdd84..c2307c423638 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -272,19 +272,11 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb) return &ui->vfs_inode; }; -static void ubifs_i_callback(struct rcu_head *head) +static void ubifs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); struct ubifs_inode *ui = ubifs_inode(inode); - kmem_cache_free(ubifs_inode_slab, ui); -} - -static void ubifs_destroy_inode(struct inode *inode) -{ - struct ubifs_inode *ui = ubifs_inode(inode); - kfree(ui->data); - call_rcu(&inode->i_rcu, ubifs_i_callback); + kmem_cache_free(ubifs_inode_slab, ui); } /* @@ -1979,7 +1971,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) const struct super_operations ubifs_super_operations = { .alloc_inode = ubifs_alloc_inode, - .destroy_inode = ubifs_destroy_inode, + .free_inode = ubifs_free_inode, .put_super = ubifs_put_super, .write_inode = ubifs_write_inode, .evict_inode = ubifs_evict_inode, diff --git a/fs/udf/super.c b/fs/udf/super.c index ffd8038ff728..f64691f2168a 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -161,17 +161,11 @@ static struct inode *udf_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void udf_i_callback(struct rcu_head *head) +static void udf_free_in_core_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(udf_inode_cachep, UDF_I(inode)); } -static void udf_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, udf_i_callback); -} - static void init_once(void *foo) { struct udf_inode_info *ei = (struct udf_inode_info *)foo; @@ -206,7 +200,7 @@ static void destroy_inodecache(void) /* Superblock operations */ static const struct super_operations udf_sb_ops = { .alloc_inode = udf_alloc_inode, - .destroy_inode = udf_destroy_inode, + .free_inode = udf_free_in_core_inode, .write_inode = udf_write_inode, .evict_inode = udf_evict_inode, .put_super = udf_put_super, diff --git a/fs/ufs/super.c b/fs/ufs/super.c index a4e07e910f1b..84c0c5178cd2 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1449,17 +1449,11 @@ static struct inode *ufs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void ufs_i_callback(struct rcu_head *head) +static void ufs_free_in_core_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); } -static void ufs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, ufs_i_callback); -} - static void init_once(void *foo) { struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; @@ -1494,7 +1488,7 @@ static void destroy_inodecache(void) static const struct super_operations ufs_super_ops = { .alloc_inode = ufs_alloc_inode, - .destroy_inode = ufs_destroy_inode, + .free_inode = ufs_free_in_core_inode, .write_inode = ufs_write_inode, .evict_inode = ufs_evict_inode, .put_super = ufs_put_super, diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 1fd3011ea623..e1f1b2e868a7 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -42,7 +42,7 @@ ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, case UFS_ST_SUNOS: if (fs32_to_cpu(sb, usb3->fs_postblformat) == UFS_42POSTBLFMT) return fs32_to_cpu(sb, usb1->fs_u0.fs_sun.fs_state); - /* Fall Through to UFS_ST_SUN */ + /* Fall Through - to UFS_ST_SUN */ case UFS_ST_SUN: return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state); case UFS_ST_SUNx86: @@ -63,7 +63,7 @@ ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, usb1->fs_u0.fs_sun.fs_state = cpu_to_fs32(sb, value); break; } - /* Fall Through to UFS_ST_SUN */ + /* Fall Through - to UFS_ST_SUN */ case UFS_ST_SUN: usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value); break; @@ -229,7 +229,7 @@ ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode) case UFS_UID_44BSD: return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid); case UFS_UID_EFT: - if (inode->ui_u1.oldids.ui_suid == 0xFFFF) + if (inode->ui_u1.oldids.ui_sgid == 0xFFFF) return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid); /* Fall through */ default: diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 89800fc7dc9d..f5de1e726356 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -629,6 +629,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ down_write(&mm->mmap_sem); + /* no task can run (and in turn coredump) yet */ + VM_WARN_ON(!mmget_still_valid(mm)); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -883,6 +885,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_sem for writing. */ down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto skip_mm; prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -905,6 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } +skip_mm: up_write(&mm->mmap_sem); mmput(mm); wakeup: @@ -1333,6 +1338,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1520,6 +1527,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 457ac9f97377..99af5e5bda9f 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -1,7 +1,6 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK - depends on (64BIT || LBDAF) select EXPORTFS select LIBCRC32C select FS_IOMAP diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7f96bdadc372..1dfc6df2e2bd 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -73,6 +73,7 @@ xfs-y += xfs_aops.o \ xfs_fsmap.o \ xfs_fsops.o \ xfs_globals.o \ + xfs_health.o \ xfs_icache.o \ xfs_ioctl.o \ xfs_iomap.o \ @@ -142,6 +143,8 @@ xfs-y += $(addprefix scrub/, \ common.o \ dabtree.o \ dir.o \ + fscounters.o \ + health.o \ ialloc.o \ inode.o \ parent.o \ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 1ef8acf35e7d..b0c89f54d1bb 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -19,6 +19,8 @@ #include "xfs_ialloc.h" #include "xfs_rmap.h" #include "xfs_ag.h" +#include "xfs_ag_resv.h" +#include "xfs_health.h" static struct xfs_buf * xfs_get_aghdr_buf( @@ -461,3 +463,55 @@ xfs_ag_extend_space( len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE); } + +/* Retrieve AG geometry. */ +int +xfs_ag_get_geometry( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_ag_geometry *ageo) +{ + struct xfs_buf *agi_bp; + struct xfs_buf *agf_bp; + struct xfs_agi *agi; + struct xfs_agf *agf; + struct xfs_perag *pag; + unsigned int freeblks; + int error; + + if (agno >= mp->m_sb.sb_agcount) + return -EINVAL; + + /* Lock the AG headers. */ + error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp); + if (error) + return error; + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp); + if (error) + goto out_agi; + pag = xfs_perag_get(mp, agno); + + /* Fill out form. */ + memset(ageo, 0, sizeof(*ageo)); + ageo->ag_number = agno; + + agi = XFS_BUF_TO_AGI(agi_bp); + ageo->ag_icount = be32_to_cpu(agi->agi_count); + ageo->ag_ifree = be32_to_cpu(agi->agi_freecount); + + agf = XFS_BUF_TO_AGF(agf_bp); + ageo->ag_length = be32_to_cpu(agf->agf_length); + freeblks = pag->pagf_freeblks + + pag->pagf_flcount + + pag->pagf_btreeblks - + xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE); + ageo->ag_freeblks = freeblks; + xfs_ag_geom_health(pag, ageo); + + /* Release resources. */ + xfs_perag_put(pag); + xfs_buf_relse(agf_bp); +out_agi: + xfs_buf_relse(agi_bp); + return error; +} diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 412702e23f61..5166322807e7 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -26,5 +26,7 @@ struct aghdr_init_data { int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp, struct aghdr_init_data *id, xfs_extlen_t len); +int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_ag_geometry *ageo); #endif /* __LIBXFS_AG_H */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index bc3367b8b7bb..a9ff3cf82cce 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2042,6 +2042,7 @@ xfs_alloc_space_available( xfs_extlen_t alloc_len, longest; xfs_extlen_t reservation; /* blocks that are still reserved */ int available; + xfs_extlen_t agflcount; if (flags & XFS_ALLOC_FLAG_FREEING) return true; @@ -2054,8 +2055,13 @@ xfs_alloc_space_available( if (longest < alloc_len) return false; - /* do we have enough free space remaining for the allocation? */ - available = (int)(pag->pagf_freeblks + pag->pagf_flcount - + /* + * Do we have enough free space remaining for the allocation? Don't + * account extra agfl blocks because we are about to defer free them, + * making them unavailable until the current transaction commits. + */ + agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free); + available = (int)(pag->pagf_freeblks + agflcount - reservation - min_free - args->minleft); if (available < (int)max(args->total, alloc_len)) return false; @@ -2237,6 +2243,9 @@ xfs_alloc_fix_freelist( xfs_extlen_t need; /* total blocks needed in freelist */ int error = 0; + /* deferred ops (AGFL block frees) require permanent transactions */ + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + if (!pag->pagf_init) { error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); if (error) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 2dd9ee2a2e08..c441f41f14e8 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -224,10 +224,10 @@ xfs_attr_try_sf_addname( */ int xfs_attr_set_args( - struct xfs_da_args *args, - struct xfs_buf **leaf_bp) + struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; + struct xfs_buf *leaf_bp = NULL; int error; /* @@ -255,7 +255,7 @@ xfs_attr_set_args( * It won't fit in the shortform, transform to a leaf block. * GROT: another possible req'mt for a double-split btree op. */ - error = xfs_attr_shortform_to_leaf(args, leaf_bp); + error = xfs_attr_shortform_to_leaf(args, &leaf_bp); if (error) return error; @@ -263,23 +263,16 @@ xfs_attr_set_args( * Prevent the leaf buffer from being unlocked so that a * concurrent AIL push cannot grab the half-baked leaf * buffer and run into problems with the write verifier. + * Once we're done rolling the transaction we can release + * the hold and add the attr to the leaf. */ - xfs_trans_bhold(args->trans, *leaf_bp); - + xfs_trans_bhold(args->trans, leaf_bp); error = xfs_defer_finish(&args->trans); - if (error) - return error; - - /* - * Commit the leaf transformation. We'll need another - * (linked) transaction to add the new attribute to the - * leaf. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) + xfs_trans_bhold_release(args->trans, leaf_bp); + if (error) { + xfs_trans_brelse(args->trans, leaf_bp); return error; - xfs_trans_bjoin(args->trans, *leaf_bp); - *leaf_bp = NULL; + } } if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) @@ -322,7 +315,6 @@ xfs_attr_set( int flags) { struct xfs_mount *mp = dp->i_mount; - struct xfs_buf *leaf_bp = NULL; struct xfs_da_args args; struct xfs_trans_res tres; int rsvd = (flags & ATTR_ROOT) != 0; @@ -381,9 +373,9 @@ xfs_attr_set( goto out_trans_cancel; xfs_trans_ijoin(args.trans, dp, 0); - error = xfs_attr_set_args(&args, &leaf_bp); + error = xfs_attr_set_args(&args); if (error) - goto out_release_leaf; + goto out_trans_cancel; if (!args.trans) { /* shortform attribute has already been committed */ goto out_unlock; @@ -408,9 +400,6 @@ out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; -out_release_leaf: - if (leaf_bp) - xfs_trans_brelse(args.trans, leaf_bp); out_trans_cancel: if (args.trans) xfs_trans_cancel(args.trans); diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 2297d8467666..3b0dce06e454 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -140,7 +140,7 @@ int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, unsigned char *value, int *valuelenp, int flags); int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, unsigned char *value, int valuelen, int flags); -int xfs_attr_set_args(struct xfs_da_args *args, struct xfs_buf **leaf_bp); +int xfs_attr_set_args(struct xfs_da_args *args); int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 4637ae1ae91c..356ebd1cbe82 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -2009,6 +2009,9 @@ xfs_bmap_add_extent_delay_real( goto done; } + if (da_new != da_old) + xfs_mod_delalloc(mp, (int64_t)da_new - da_old); + if (bma->cur) { da_new += bma->cur->bc_private.b.allocated; bma->cur->bc_private.b.allocated = 0; @@ -2640,6 +2643,7 @@ xfs_bmap_add_extent_hole_delay( /* * Nothing to do for disk quota accounting here. */ + xfs_mod_delalloc(ip->i_mount, (int64_t)newlen - oldlen); } } @@ -3352,8 +3356,10 @@ xfs_bmap_btalloc_accounting( * already have quota reservation and there's nothing to do * yet. */ - if (ap->wasdel) + if (ap->wasdel) { + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); return; + } /* * Otherwise, we've allocated blocks in a hole. The transaction @@ -3372,8 +3378,10 @@ xfs_bmap_btalloc_accounting( /* data/attr fork only */ ap->ip->i_d.di_nblocks += args->len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - if (ap->wasdel) + if (ap->wasdel) { ap->ip->i_delayed_blks -= args->len; + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); + } xfs_trans_mod_dquot_byino(ap->tp, ap->ip, ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT, args->len); @@ -3969,6 +3977,7 @@ xfs_bmapi_reserve_delalloc( ip->i_delayed_blks += alen; + xfs_mod_delalloc(ip->i_mount, alen + indlen); got->br_startoff = aoff; got->br_startblock = nullstartblock(indlen); @@ -4840,8 +4849,10 @@ xfs_bmap_del_extent_delay( da_diff = da_old - da_new; if (!isrt) da_diff += del->br_blockcount; - if (da_diff) + if (da_diff) { xfs_mod_fdblocks(mp, da_diff, false); + xfs_mod_delalloc(mp, -da_diff); + } return error; } diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 94f00427de98..1c6bf2105939 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -274,13 +274,15 @@ xfs_defer_trans_roll( trace_xfs_defer_trans_roll(tp, _RET_IP_); - /* Roll the transaction. */ + /* + * Roll the transaction. Rolling always given a new transaction (even + * if committing the old one fails!) to hand back to the caller, so we + * join the held resources to the new transaction so that we always + * return with the held resources joined to @tpp, no matter what + * happened. + */ error = xfs_trans_roll(tpp); tp = *tpp; - if (error) { - trace_xfs_defer_trans_roll_error(tp, error); - return error; - } /* Rejoin the joined inodes. */ for (i = 0; i < ipcount; i++) @@ -292,6 +294,8 @@ xfs_defer_trans_roll( xfs_trans_bhold(tp, bplist[i]); } + if (error) + trace_xfs_defer_trans_roll_error(tp, error); return error; } diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index fb5bd9a804f6..88fa11071f9f 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -110,7 +110,7 @@ xfs_dqblk_verify( /* * Do some primitive error checking on ondisk dquot data structures. */ -int +void xfs_dqblk_repair( struct xfs_mount *mp, struct xfs_dqblk *dqb, @@ -133,8 +133,6 @@ xfs_dqblk_repair( xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } - - return 0; } STATIC bool diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index f3aa59302fef..e7382c780ed7 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -124,7 +124,7 @@ typedef struct xfs_flock64 { /* * Output for XFS_IOC_FSGEOMETRY_V1 */ -typedef struct xfs_fsop_geom_v1 { +struct xfs_fsop_geom_v1 { __u32 blocksize; /* filesystem (data) block size */ __u32 rtextsize; /* realtime extent size */ __u32 agblocks; /* fsblocks in an AG */ @@ -145,12 +145,39 @@ typedef struct xfs_fsop_geom_v1 { __u32 logsectsize; /* log sector size, bytes */ __u32 rtsectsize; /* realtime sector size, bytes */ __u32 dirblocksize; /* directory block size, bytes */ -} xfs_fsop_geom_v1_t; +}; + +/* + * Output for XFS_IOC_FSGEOMETRY_V4 + */ +struct xfs_fsop_geom_v4 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ + __u32 logsunit; /* log stripe unit, bytes */ +}; /* * Output for XFS_IOC_FSGEOMETRY */ -typedef struct xfs_fsop_geom { +struct xfs_fsop_geom { __u32 blocksize; /* filesystem (data) block size */ __u32 rtextsize; /* realtime extent size */ __u32 agblocks; /* fsblocks in an AG */ @@ -171,8 +198,18 @@ typedef struct xfs_fsop_geom { __u32 logsectsize; /* log sector size, bytes */ __u32 rtsectsize; /* realtime sector size, bytes */ __u32 dirblocksize; /* directory block size, bytes */ - __u32 logsunit; /* log stripe unit, bytes */ -} xfs_fsop_geom_t; + __u32 logsunit; /* log stripe unit, bytes */ + uint32_t sick; /* o: unhealthy fs & rt metadata */ + uint32_t checked; /* o: checked fs & rt metadata */ + __u64 reserved[17]; /* reserved space */ +}; + +#define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */ +#define XFS_FSOP_GEOM_SICK_UQUOTA (1 << 1) /* user quota */ +#define XFS_FSOP_GEOM_SICK_GQUOTA (1 << 2) /* group quota */ +#define XFS_FSOP_GEOM_SICK_PQUOTA (1 << 3) /* project quota */ +#define XFS_FSOP_GEOM_SICK_RT_BITMAP (1 << 4) /* realtime bitmap */ +#define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */ /* Output for XFS_FS_COUNTS */ typedef struct xfs_fsop_counts { @@ -188,28 +225,30 @@ typedef struct xfs_fsop_resblks { __u64 resblks_avail; } xfs_fsop_resblks_t; -#define XFS_FSOP_GEOM_VERSION 0 - -#define XFS_FSOP_GEOM_FLAGS_ATTR 0x0001 /* attributes in use */ -#define XFS_FSOP_GEOM_FLAGS_NLINK 0x0002 /* 32-bit nlink values */ -#define XFS_FSOP_GEOM_FLAGS_QUOTA 0x0004 /* quotas enabled */ -#define XFS_FSOP_GEOM_FLAGS_IALIGN 0x0008 /* inode alignment */ -#define XFS_FSOP_GEOM_FLAGS_DALIGN 0x0010 /* large data alignment */ -#define XFS_FSOP_GEOM_FLAGS_SHARED 0x0020 /* read-only shared */ -#define XFS_FSOP_GEOM_FLAGS_EXTFLG 0x0040 /* special extent flag */ -#define XFS_FSOP_GEOM_FLAGS_DIRV2 0x0080 /* directory version 2 */ -#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ -#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ -#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ -#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ -#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ -#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ -#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ -#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ -#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ -#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */ -#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* reverse mapping btree */ -#define XFS_FSOP_GEOM_FLAGS_REFLINK 0x100000 /* files can share blocks */ +#define XFS_FSOP_GEOM_VERSION 0 +#define XFS_FSOP_GEOM_VERSION_V5 5 + +#define XFS_FSOP_GEOM_FLAGS_ATTR (1 << 0) /* attributes in use */ +#define XFS_FSOP_GEOM_FLAGS_NLINK (1 << 1) /* 32-bit nlink values */ +#define XFS_FSOP_GEOM_FLAGS_QUOTA (1 << 2) /* quotas enabled */ +#define XFS_FSOP_GEOM_FLAGS_IALIGN (1 << 3) /* inode alignment */ +#define XFS_FSOP_GEOM_FLAGS_DALIGN (1 << 4) /* large data alignment */ +#define XFS_FSOP_GEOM_FLAGS_SHARED (1 << 5) /* read-only shared */ +#define XFS_FSOP_GEOM_FLAGS_EXTFLG (1 << 6) /* special extent flag */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2 (1 << 7) /* directory version 2 */ +#define XFS_FSOP_GEOM_FLAGS_LOGV2 (1 << 8) /* log format version 2 */ +#define XFS_FSOP_GEOM_FLAGS_SECTOR (1 << 9) /* sector sizes >1BB */ +#define XFS_FSOP_GEOM_FLAGS_ATTR2 (1 << 10) /* inline attributes rework */ +#define XFS_FSOP_GEOM_FLAGS_PROJID32 (1 << 11) /* 32-bit project IDs */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI (1 << 12) /* ASCII only CI names */ + /* -- Do not use -- (1 << 13) SGI parent pointers */ +#define XFS_FSOP_GEOM_FLAGS_LAZYSB (1 << 14) /* lazy superblock counters */ +#define XFS_FSOP_GEOM_FLAGS_V5SB (1 << 15) /* version 5 superblock */ +#define XFS_FSOP_GEOM_FLAGS_FTYPE (1 << 16) /* inode directory types */ +#define XFS_FSOP_GEOM_FLAGS_FINOBT (1 << 17) /* free inode btree */ +#define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */ +#define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */ +#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ /* * Minimum and maximum sizes need for growth checks. @@ -238,6 +277,31 @@ typedef struct xfs_fsop_resblks { (s)->sb_agblocks + XFS_MIN_AG_BLOCKS) /* + * Output for XFS_IOC_AG_GEOMETRY + */ +struct xfs_ag_geometry { + uint32_t ag_number; /* i/o: AG number */ + uint32_t ag_length; /* o: length in blocks */ + uint32_t ag_freeblks; /* o: free space */ + uint32_t ag_icount; /* o: inodes allocated */ + uint32_t ag_ifree; /* o: inodes free */ + uint32_t ag_sick; /* o: sick things in ag */ + uint32_t ag_checked; /* o: checked metadata in ag */ + uint32_t ag_reserved32; /* o: zero */ + uint64_t ag_reserved[12];/* o: zero */ +}; +#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */ +#define XFS_AG_GEOM_SICK_AGF (1 << 1) /* AGF header */ +#define XFS_AG_GEOM_SICK_AGFL (1 << 2) /* AGFL header */ +#define XFS_AG_GEOM_SICK_AGI (1 << 3) /* AGI header */ +#define XFS_AG_GEOM_SICK_BNOBT (1 << 4) /* free space by block */ +#define XFS_AG_GEOM_SICK_CNTBT (1 << 5) /* free space by length */ +#define XFS_AG_GEOM_SICK_INOBT (1 << 6) /* inode index */ +#define XFS_AG_GEOM_SICK_FINOBT (1 << 7) /* free inode index */ +#define XFS_AG_GEOM_SICK_RMAPBT (1 << 8) /* reverse mappings */ +#define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9) /* reference counts */ + +/* * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT */ typedef struct xfs_growfs_data { @@ -285,13 +349,25 @@ typedef struct xfs_bstat { #define bs_projid bs_projid_lo /* (previously just bs_projid) */ __u16 bs_forkoff; /* inode fork offset in bytes */ __u16 bs_projid_hi; /* higher part of project id */ - unsigned char bs_pad[6]; /* pad space, unused */ + uint16_t bs_sick; /* sick inode metadata */ + uint16_t bs_checked; /* checked inode metadata */ + unsigned char bs_pad[2]; /* pad space, unused */ __u32 bs_cowextsize; /* cow extent size */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ } xfs_bstat_t; +/* bs_sick flags */ +#define XFS_BS_SICK_INODE (1 << 0) /* inode core */ +#define XFS_BS_SICK_BMBTD (1 << 1) /* data fork */ +#define XFS_BS_SICK_BMBTA (1 << 2) /* attr fork */ +#define XFS_BS_SICK_BMBTC (1 << 3) /* cow fork */ +#define XFS_BS_SICK_DIR (1 << 4) /* directory */ +#define XFS_BS_SICK_XATTR (1 << 5) /* extended attributes */ +#define XFS_BS_SICK_SYMLINK (1 << 6) /* symbolic link remote target */ +#define XFS_BS_SICK_PARENT (1 << 7) /* parent pointers */ + /* * Project quota id helpers (previously projid was 16bit only * and using two 16bit values to hold new 32bit projid was choosen @@ -502,9 +578,10 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_UQUOTA 21 /* user quotas */ #define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */ #define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */ +#define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 24 +#define XFS_SCRUB_TYPE_NR 25 /* i: Repair this metadata. */ #define XFS_SCRUB_IFLAG_REPAIR (1 << 0) @@ -590,6 +667,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks) /* XFS_IOC_GETFSMAP ------ hoisted 59 */ #define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata) +#define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry) /* * ioctl commands that replace IRIX syssgi()'s @@ -620,8 +698,9 @@ struct xfs_scrub_metadata { #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) -#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom) +#define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4) #define XFS_IOC_GOINGDOWN _IOR ('X', 125, uint32_t) +#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h new file mode 100644 index 000000000000..49ddfeac19f2 --- /dev/null +++ b/fs/xfs/libxfs/xfs_health.h @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_HEALTH_H__ +#define __XFS_HEALTH_H__ + +/* + * In-Core Filesystem Health Assessments + * ===================================== + * + * We'd like to be able to summarize the current health status of the + * filesystem so that the administrator knows when it's necessary to schedule + * some downtime for repairs. Until then, we would also like to avoid abrupt + * shutdowns due to corrupt metadata. + * + * The online scrub feature evaluates the health of all filesystem metadata. + * When scrub detects corruption in a piece of metadata it will set the + * corresponding sickness flag, and repair will clear it if successful. If + * problems remain at unmount time, we can also request manual intervention by + * logging a notice to run xfs_repair. + * + * Each health tracking group uses a pair of fields for reporting. The + * "checked" field tell us if a given piece of metadata has ever been examined, + * and the "sick" field tells us if that piece was found to need repairs. + * Therefore we can conclude that for a given sick flag value: + * + * - checked && sick => metadata needs repair + * - checked && !sick => metadata is ok + * - !checked => has not been examined since mount + */ + +struct xfs_mount; +struct xfs_perag; +struct xfs_inode; +struct xfs_fsop_geom; + +/* Observable health issues for metadata spanning the entire filesystem. */ +#define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */ +#define XFS_SICK_FS_UQUOTA (1 << 1) /* user quota */ +#define XFS_SICK_FS_GQUOTA (1 << 2) /* group quota */ +#define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */ + +/* Observable health issues for realtime volume metadata. */ +#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */ +#define XFS_SICK_RT_SUMMARY (1 << 1) /* realtime summary */ + +/* Observable health issues for AG metadata. */ +#define XFS_SICK_AG_SB (1 << 0) /* superblock */ +#define XFS_SICK_AG_AGF (1 << 1) /* AGF header */ +#define XFS_SICK_AG_AGFL (1 << 2) /* AGFL header */ +#define XFS_SICK_AG_AGI (1 << 3) /* AGI header */ +#define XFS_SICK_AG_BNOBT (1 << 4) /* free space by block */ +#define XFS_SICK_AG_CNTBT (1 << 5) /* free space by length */ +#define XFS_SICK_AG_INOBT (1 << 6) /* inode index */ +#define XFS_SICK_AG_FINOBT (1 << 7) /* free inode index */ +#define XFS_SICK_AG_RMAPBT (1 << 8) /* reverse mappings */ +#define XFS_SICK_AG_REFCNTBT (1 << 9) /* reference counts */ + +/* Observable health issues for inode metadata. */ +#define XFS_SICK_INO_CORE (1 << 0) /* inode core */ +#define XFS_SICK_INO_BMBTD (1 << 1) /* data fork */ +#define XFS_SICK_INO_BMBTA (1 << 2) /* attr fork */ +#define XFS_SICK_INO_BMBTC (1 << 3) /* cow fork */ +#define XFS_SICK_INO_DIR (1 << 4) /* directory */ +#define XFS_SICK_INO_XATTR (1 << 5) /* extended attributes */ +#define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */ +#define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */ + +/* Primary evidence of health problems in a given group. */ +#define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ + XFS_SICK_FS_UQUOTA | \ + XFS_SICK_FS_GQUOTA | \ + XFS_SICK_FS_PQUOTA) + +#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \ + XFS_SICK_RT_SUMMARY) + +#define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \ + XFS_SICK_AG_AGF | \ + XFS_SICK_AG_AGFL | \ + XFS_SICK_AG_AGI | \ + XFS_SICK_AG_BNOBT | \ + XFS_SICK_AG_CNTBT | \ + XFS_SICK_AG_INOBT | \ + XFS_SICK_AG_FINOBT | \ + XFS_SICK_AG_RMAPBT | \ + XFS_SICK_AG_REFCNTBT) + +#define XFS_SICK_INO_PRIMARY (XFS_SICK_INO_CORE | \ + XFS_SICK_INO_BMBTD | \ + XFS_SICK_INO_BMBTA | \ + XFS_SICK_INO_BMBTC | \ + XFS_SICK_INO_DIR | \ + XFS_SICK_INO_XATTR | \ + XFS_SICK_INO_SYMLINK | \ + XFS_SICK_INO_PARENT) + +/* These functions must be provided by the xfs implementation. */ + +void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask); +void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask); +void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick, + unsigned int *checked); + +void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask); +void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask); +void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick, + unsigned int *checked); + +void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask); +void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask); +void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick, + unsigned int *checked); + +void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask); +void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask); +void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick, + unsigned int *checked); + +void xfs_health_unmount(struct xfs_mount *mp); + +/* Now some helpers. */ + +static inline bool +xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask) +{ + unsigned int sick, checked; + + xfs_fs_measure_sickness(mp, &sick, &checked); + return sick & mask; +} + +static inline bool +xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask) +{ + unsigned int sick, checked; + + xfs_rt_measure_sickness(mp, &sick, &checked); + return sick & mask; +} + +static inline bool +xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask) +{ + unsigned int sick, checked; + + xfs_ag_measure_sickness(pag, &sick, &checked); + return sick & mask; +} + +static inline bool +xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask) +{ + unsigned int sick, checked; + + xfs_inode_measure_sickness(ip, &sick, &checked); + return sick & mask; +} + +static inline bool +xfs_fs_is_healthy(struct xfs_mount *mp) +{ + return !xfs_fs_has_sickness(mp, -1U); +} + +static inline bool +xfs_rt_is_healthy(struct xfs_mount *mp) +{ + return !xfs_rt_has_sickness(mp, -1U); +} + +static inline bool +xfs_ag_is_healthy(struct xfs_perag *pag) +{ + return !xfs_ag_has_sickness(pag, -1U); +} + +static inline bool +xfs_inode_is_healthy(struct xfs_inode *ip) +{ + return !xfs_inode_has_sickness(ip, -1U); +} + +void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo); +void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); +void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bstat *bs); + +#endif /* __XFS_HEALTH_H__ */ diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 4bfdd5f4c6af..b2113b17e53c 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -142,7 +142,7 @@ extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); -extern int xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, +extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 77a3a4085de3..e76a3e5d28d7 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -30,6 +30,7 @@ #include "xfs_refcount_btree.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_health.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -905,7 +906,7 @@ xfs_initialize_perag_data( /* * If the new summary counts are obviously incorrect, fail the * mount operation because that implies the AGFs are also corrupt. - * Clear BAD_SUMMARY so that we don't unmount with a dirty log, which + * Clear FS_COUNTERS so that we don't unmount with a dirty log, which * will prevent xfs_repair from fixing anything. */ if (fdblocks > sbp->sb_dblocks || ifree > ialloc) { @@ -923,7 +924,7 @@ xfs_initialize_perag_data( xfs_reinit_percpu_counters(mp); out: - mp->m_flags &= ~XFS_MOUNT_BAD_SUMMARY; + xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS); return error; } @@ -1084,7 +1085,7 @@ out: return error; } -int +void xfs_fs_geometry( struct xfs_sb *sbp, struct xfs_fsop_geom *geo, @@ -1108,13 +1109,13 @@ xfs_fs_geometry( memcpy(geo->uuid, &sbp->sb_uuid, sizeof(sbp->sb_uuid)); if (struct_version < 2) - return 0; + return; geo->sunit = sbp->sb_unit; geo->swidth = sbp->sb_width; if (struct_version < 3) - return 0; + return; geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | @@ -1158,14 +1159,17 @@ xfs_fs_geometry( geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); if (struct_version < 4) - return 0; + return; if (xfs_sb_version_haslogv2(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2; geo->logsunit = sbp->sb_logsunit; - return 0; + if (struct_version < 5) + return; + + geo->version = XFS_FSOP_GEOM_VERSION_V5; } /* Read a secondary superblock. */ diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 13564d69800a..92465a9a5162 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -33,7 +33,7 @@ extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); extern int xfs_update_secondary_sbs(struct xfs_mount *mp); #define XFS_FS_GEOM_MAX_STRUCT_VER (4) -extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo, +extern void xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo, int struct_version); extern int xfs_sb_read_secondary(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index f99a7aefe418..83f4ee2afc49 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -876,9 +876,13 @@ xfs_trans_resv_calc( resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp); resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT; + /* growdata requires permanent res; it can free space to the last AG */ + resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp); + resp->tr_growdata.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; + resp->tr_growdata.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + /* The following transaction are logged in logical format */ resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp); - resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp); resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp); resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp); resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp); diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index de310712dd6d..d51acc95bc00 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -185,7 +185,7 @@ xfs_verify_rtbno( } /* Calculate the range of valid icount values. */ -static void +void xfs_icount_range( struct xfs_mount *mp, unsigned long long *min, diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index c5a25403b4db..802b34cd10fe 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -191,5 +191,7 @@ bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount); bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off); +void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min, + unsigned long long *max); #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index ddf06bfaa29d..adaeabdefdd3 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -514,6 +514,7 @@ xchk_agf( { struct xfs_mount *mp = sc->mp; struct xfs_agf *agf; + struct xfs_perag *pag; xfs_agnumber_t agno; xfs_agblock_t agbno; xfs_agblock_t eoag; @@ -586,6 +587,16 @@ xchk_agf( if (agfl_count != 0 && fl_count != agfl_count) xchk_block_set_corrupt(sc, sc->sa.agf_bp); + /* Do the incore counters match? */ + pag = xfs_perag_get(mp, agno); + if (pag->pagf_freeblks != be32_to_cpu(agf->agf_freeblks)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + if (pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + xfs_perag_put(pag); + xchk_agf_xref(sc); out: return error; @@ -811,6 +822,7 @@ xchk_agi( { struct xfs_mount *mp = sc->mp; struct xfs_agi *agi; + struct xfs_perag *pag; xfs_agnumber_t agno; xfs_agblock_t agbno; xfs_agblock_t eoag; @@ -881,6 +893,14 @@ xchk_agi( if (agi->agi_pad32 != cpu_to_be32(0)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); + /* Do the incore counters match? */ + pag = xfs_perag_get(mp, agno); + if (pag->pagi_count != be32_to_cpu(agi->agi_count)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + xfs_perag_put(pag); + xchk_agi_xref(sc); out: return error; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 0c54ff55b901..973aa59975e3 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -38,6 +38,7 @@ #include "scrub/trace.h" #include "scrub/btree.h" #include "scrub/repair.h" +#include "scrub/health.h" /* Common code for the metadata scrubbers. */ @@ -208,6 +209,15 @@ xchk_ino_set_preen( trace_xchk_ino_preen(sc, ino, __return_address); } +/* Record something being wrong with the filesystem primary superblock. */ +void +xchk_set_corrupt( + struct xfs_scrub *sc) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xchk_fs_error(sc, 0, __return_address); +} + /* Record a corrupt block. */ void xchk_block_set_corrupt( @@ -458,13 +468,18 @@ xchk_ag_btcur_init( struct xfs_mount *mp = sc->mp; xfs_agnumber_t agno = sa->agno; - if (sa->agf_bp) { + xchk_perag_get(sc->mp, sa); + if (sa->agf_bp && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) { /* Set up a bnobt cursor for cross-referencing. */ sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, agno, XFS_BTNUM_BNO); if (!sa->bno_cur) goto err; + } + if (sa->agf_bp && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) { /* Set up a cntbt cursor for cross-referencing. */ sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, agno, XFS_BTNUM_CNT); @@ -473,7 +488,8 @@ xchk_ag_btcur_init( } /* Set up a inobt cursor for cross-referencing. */ - if (sa->agi_bp) { + if (sa->agi_bp && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) { sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, agno, XFS_BTNUM_INO); if (!sa->ino_cur) @@ -481,7 +497,8 @@ xchk_ag_btcur_init( } /* Set up a finobt cursor for cross-referencing. */ - if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb)) { + if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, agno, XFS_BTNUM_FINO); if (!sa->fino_cur) @@ -489,7 +506,8 @@ xchk_ag_btcur_init( } /* Set up a rmapbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) { sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, agno); if (!sa->rmap_cur) @@ -497,7 +515,8 @@ xchk_ag_btcur_init( } /* Set up a refcountbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb)) { + if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) { sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, sa->agf_bp, agno); if (!sa->refc_cur) @@ -884,3 +903,21 @@ xchk_ilock_inverted( } return -EDEADLOCK; } + +/* Pause background reaping of resources. */ +void +xchk_stop_reaping( + struct xfs_scrub *sc) +{ + sc->flags |= XCHK_REAPING_DISABLED; + xfs_stop_block_reaping(sc->mp); +} + +/* Restart background reaping of resources. */ +void +xchk_start_reaping( + struct xfs_scrub *sc) +{ + xfs_start_block_reaping(sc->mp); + sc->flags &= ~XCHK_REAPING_DISABLED; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index e26a430bd466..003a772cd26c 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -39,6 +39,7 @@ void xchk_block_set_preen(struct xfs_scrub *sc, struct xfs_buf *bp); void xchk_ino_set_preen(struct xfs_scrub *sc, xfs_ino_t ino); +void xchk_set_corrupt(struct xfs_scrub *sc); void xchk_block_set_corrupt(struct xfs_scrub *sc, struct xfs_buf *bp); void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino); @@ -105,6 +106,7 @@ xchk_setup_quota(struct xfs_scrub *sc, struct xfs_inode *ip) return -ENOENT; } #endif +int xchk_setup_fscounters(struct xfs_scrub *sc, struct xfs_inode *ip); void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, @@ -137,5 +139,7 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) int xchk_metadata_inode_forks(struct xfs_scrub *sc); int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode); +void xchk_stop_reaping(struct xfs_scrub *sc); +void xchk_start_reaping(struct xfs_scrub *sc); #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c new file mode 100644 index 000000000000..07c11e3e6437 --- /dev/null +++ b/fs/xfs/scrub/fscounters.c @@ -0,0 +1,366 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_error.h" +#include "xfs_errortag.h" +#include "xfs_icache.h" +#include "xfs_health.h" +#include "xfs_bmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* + * FS Summary Counters + * =================== + * + * The basics of filesystem summary counter checking are that we iterate the + * AGs counting the number of free blocks, free space btree blocks, per-AG + * reservations, inodes, delayed allocation reservations, and free inodes. + * Then we compare what we computed against the in-core counters. + * + * However, the reality is that summary counters are a tricky beast to check. + * While we /could/ freeze the filesystem and scramble around the AGs counting + * the free blocks, in practice we prefer not do that for a scan because + * freezing is costly. To get around this, we added a per-cpu counter of the + * delalloc reservations so that we can rotor around the AGs relatively + * quickly, and we allow the counts to be slightly off because we're not taking + * any locks while we do this. + * + * So the first thing we do is warm up the buffer cache in the setup routine by + * walking all the AGs to make sure the incore per-AG structure has been + * initialized. The expected value calculation then iterates the incore per-AG + * structures as quickly as it can. We snapshot the percpu counters before and + * after this operation and use the difference in counter values to guess at + * our tolerance for mismatch between expected and actual counter values. + */ + +/* + * Since the expected value computation is lockless but only browses incore + * values, the percpu counters should be fairly close to each other. However, + * we'll allow ourselves to be off by at least this (arbitrary) amount. + */ +#define XCHK_FSCOUNT_MIN_VARIANCE (512) + +/* + * Make sure the per-AG structure has been initialized from the on-disk header + * contents and trust that the incore counters match the ondisk counters. (The + * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the + * summary counters after checking all AG headers). Do this from the setup + * function so that the inner AG aggregation loop runs as quickly as possible. + * + * This function runs during the setup phase /before/ we start checking any + * metadata. + */ +STATIC int +xchk_fscount_warmup( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agi_bp = NULL; + struct xfs_buf *agf_bp = NULL; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno; + int error = 0; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + + if (pag->pagi_init && pag->pagf_init) + goto next_loop_perag; + + /* Lock both AG headers. */ + error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp); + if (error) + break; + error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp); + if (error) + break; + error = -ENOMEM; + if (!agf_bp || !agi_bp) + break; + + /* + * These are supposed to be initialized by the header read + * function. + */ + error = -EFSCORRUPTED; + if (!pag->pagi_init || !pag->pagf_init) + break; + + xfs_buf_relse(agf_bp); + agf_bp = NULL; + xfs_buf_relse(agi_bp); + agi_bp = NULL; +next_loop_perag: + xfs_perag_put(pag); + pag = NULL; + error = 0; + + if (fatal_signal_pending(current)) + break; + } + + if (agf_bp) + xfs_buf_relse(agf_bp); + if (agi_bp) + xfs_buf_relse(agi_bp); + if (pag) + xfs_perag_put(pag); + return error; +} + +int +xchk_setup_fscounters( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + struct xchk_fscounters *fsc; + int error; + + sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP); + if (!sc->buf) + return -ENOMEM; + fsc = sc->buf; + + xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); + + /* We must get the incore counters set up before we can proceed. */ + error = xchk_fscount_warmup(sc); + if (error) + return error; + + /* + * Pause background reclaim while we're scrubbing to reduce the + * likelihood of background perturbations to the counters throwing off + * our calculations. + */ + xchk_stop_reaping(sc); + + return xchk_trans_alloc(sc, 0); +} + +/* + * Calculate what the global in-core counters ought to be from the incore + * per-AG structure. Callers can compare this to the actual in-core counters + * to estimate by how much both in-core and on-disk counters need to be + * adjusted. + */ +STATIC int +xchk_fscount_aggregate_agcounts( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag; + uint64_t delayed; + xfs_agnumber_t agno; + int tries = 8; + +retry: + fsc->icount = 0; + fsc->ifree = 0; + fsc->fdblocks = 0; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + + /* This somehow got unset since the warmup? */ + if (!pag->pagi_init || !pag->pagf_init) { + xfs_perag_put(pag); + return -EFSCORRUPTED; + } + + /* Count all the inodes */ + fsc->icount += pag->pagi_count; + fsc->ifree += pag->pagi_freecount; + + /* Add up the free/freelist/bnobt/cntbt blocks */ + fsc->fdblocks += pag->pagf_freeblks; + fsc->fdblocks += pag->pagf_flcount; + fsc->fdblocks += pag->pagf_btreeblks; + + /* + * Per-AG reservations are taken out of the incore counters, + * so they must be left out of the free blocks computation. + */ + fsc->fdblocks -= pag->pag_meta_resv.ar_reserved; + fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved; + + xfs_perag_put(pag); + + if (fatal_signal_pending(current)) + break; + } + + /* + * The global incore space reservation is taken from the incore + * counters, so leave that out of the computation. + */ + fsc->fdblocks -= mp->m_resblks_avail; + + /* + * Delayed allocation reservations are taken out of the incore counters + * but not recorded on disk, so leave them and their indlen blocks out + * of the computation. + */ + delayed = percpu_counter_sum(&mp->m_delalloc_blks); + fsc->fdblocks -= delayed; + + trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks, + delayed); + + + /* Bail out if the values we compute are totally nonsense. */ + if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max || + fsc->fdblocks > mp->m_sb.sb_dblocks || + fsc->ifree > fsc->icount_max) + return -EFSCORRUPTED; + + /* + * If ifree > icount then we probably had some perturbation in the + * counters while we were calculating things. We'll try a few times + * to maintain ifree <= icount before giving up. + */ + if (fsc->ifree > fsc->icount) { + if (tries--) + goto retry; + xchk_set_incomplete(sc); + return 0; + } + + return 0; +} + +/* + * Is the @counter reasonably close to the @expected value? + * + * We neither locked nor froze anything in the filesystem while aggregating the + * per-AG data to compute the @expected value, which means that the counter + * could have changed. We know the @old_value of the summation of the counter + * before the aggregation, and we re-sum the counter now. If the expected + * value falls between the two summations, we're ok. + * + * Otherwise, we /might/ have a problem. If the change in the summations is + * more than we want to tolerate, the filesystem is probably busy and we should + * just send back INCOMPLETE and see if userspace will try again. + */ +static inline bool +xchk_fscount_within_range( + struct xfs_scrub *sc, + const int64_t old_value, + struct percpu_counter *counter, + uint64_t expected) +{ + int64_t min_value, max_value; + int64_t curr_value = percpu_counter_sum(counter); + + trace_xchk_fscounters_within_range(sc->mp, expected, curr_value, + old_value); + + /* Negative values are always wrong. */ + if (curr_value < 0) + return false; + + /* Exact matches are always ok. */ + if (curr_value == expected) + return true; + + min_value = min(old_value, curr_value); + max_value = max(old_value, curr_value); + + /* Within the before-and-after range is ok. */ + if (expected >= min_value && expected <= max_value) + return true; + + /* + * If the difference between the two summations is too large, the fs + * might just be busy and so we'll mark the scrub incomplete. Return + * true here so that we don't mark the counter corrupt. + * + * XXX: In the future when userspace can grant scrub permission to + * quiesce the filesystem to solve the outsized variance problem, this + * check should be moved up and the return code changed to signal to + * userspace that we need quiesce permission. + */ + if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) { + xchk_set_incomplete(sc); + return true; + } + + return false; +} + +/* Check the superblock counters. */ +int +xchk_fscounters( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xchk_fscounters *fsc = sc->buf; + int64_t icount, ifree, fdblocks; + int error; + + /* Snapshot the percpu counters. */ + icount = percpu_counter_sum(&mp->m_icount); + ifree = percpu_counter_sum(&mp->m_ifree); + fdblocks = percpu_counter_sum(&mp->m_fdblocks); + + /* No negative values, please! */ + if (icount < 0 || ifree < 0 || fdblocks < 0) + xchk_set_corrupt(sc); + + /* See if icount is obviously wrong. */ + if (icount < fsc->icount_min || icount > fsc->icount_max) + xchk_set_corrupt(sc); + + /* See if fdblocks is obviously wrong. */ + if (fdblocks > mp->m_sb.sb_dblocks) + xchk_set_corrupt(sc); + + /* + * If ifree exceeds icount by more than the minimum variance then + * something's probably wrong with the counters. + */ + if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE) + xchk_set_corrupt(sc); + + /* Walk the incore AG headers to calculate the expected counters. */ + error = xchk_fscount_aggregate_agcounts(sc, fsc); + if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + + /* Compare the in-core counters with whatever we counted. */ + if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) + xchk_set_corrupt(sc); + + if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) + xchk_set_corrupt(sc); + + if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, + fsc->fdblocks)) + xchk_set_corrupt(sc); + + return 0; +} diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c new file mode 100644 index 000000000000..23cf8e2f25db --- /dev/null +++ b/fs/xfs/scrub/health.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_health.h" +#include "scrub/scrub.h" +#include "scrub/health.h" + +/* + * Scrub and In-Core Filesystem Health Assessments + * =============================================== + * + * Online scrub and repair have the time and the ability to perform stronger + * checks than we can do from the metadata verifiers, because they can + * cross-reference records between data structures. Therefore, scrub is in a + * good position to update the online filesystem health assessments to reflect + * the good/bad state of the data structure. + * + * We therefore extend scrub in the following ways to achieve this: + * + * 1. Create a "sick_mask" field in the scrub context. When we're setting up a + * scrub call, set this to the default XFS_SICK_* flag(s) for the selected + * scrub type (call it A). Scrub and repair functions can override the default + * sick_mask value if they choose. + * + * 2. If the scrubber returns a runtime error code, we exit making no changes + * to the incore sick state. + * + * 3. If the scrubber finds that A is clean, use sick_mask to clear the incore + * sick flags before exiting. + * + * 4. If the scrubber finds that A is corrupt, use sick_mask to set the incore + * sick flags. If the user didn't want to repair then we exit, leaving the + * metadata structure unfixed and the sick flag set. + * + * 5. Now we know that A is corrupt and the user wants to repair, so run the + * repairer. If the repairer returns an error code, we exit with that error + * code, having made no further changes to the incore sick state. + * + * 6. If repair rebuilds A correctly and the subsequent re-scrub of A is clean, + * use sick_mask to clear the incore sick flags. This should have the effect + * that A is no longer marked sick. + * + * 7. If repair rebuilds A incorrectly, the re-scrub will find it corrupt and + * use sick_mask to set the incore sick flags. This should have no externally + * visible effect since we already set them in step (4). + * + * There are some complications to this story, however. For certain types of + * complementary metadata indices (e.g. inobt/finobt), it is easier to rebuild + * both structures at the same time. The following principles apply to this + * type of repair strategy: + * + * 8. Any repair function that rebuilds multiple structures should update + * sick_mask_visible to reflect whatever other structures are rebuilt, and + * verify that all the rebuilt structures can pass a scrub check. The outcomes + * of 5-7 still apply, but with a sick_mask that covers everything being + * rebuilt. + */ + +/* Map our scrub type to a sick mask and a set of health update functions. */ + +enum xchk_health_group { + XHG_FS = 1, + XHG_RT, + XHG_AG, + XHG_INO, +}; + +struct xchk_health_map { + enum xchk_health_group group; + unsigned int sick_mask; +}; + +static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB }, + [XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF }, + [XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL }, + [XFS_SCRUB_TYPE_AGI] = { XHG_AG, XFS_SICK_AG_AGI }, + [XFS_SCRUB_TYPE_BNOBT] = { XHG_AG, XFS_SICK_AG_BNOBT }, + [XFS_SCRUB_TYPE_CNTBT] = { XHG_AG, XFS_SICK_AG_CNTBT }, + [XFS_SCRUB_TYPE_INOBT] = { XHG_AG, XFS_SICK_AG_INOBT }, + [XFS_SCRUB_TYPE_FINOBT] = { XHG_AG, XFS_SICK_AG_FINOBT }, + [XFS_SCRUB_TYPE_RMAPBT] = { XHG_AG, XFS_SICK_AG_RMAPBT }, + [XFS_SCRUB_TYPE_REFCNTBT] = { XHG_AG, XFS_SICK_AG_REFCNTBT }, + [XFS_SCRUB_TYPE_INODE] = { XHG_INO, XFS_SICK_INO_CORE }, + [XFS_SCRUB_TYPE_BMBTD] = { XHG_INO, XFS_SICK_INO_BMBTD }, + [XFS_SCRUB_TYPE_BMBTA] = { XHG_INO, XFS_SICK_INO_BMBTA }, + [XFS_SCRUB_TYPE_BMBTC] = { XHG_INO, XFS_SICK_INO_BMBTC }, + [XFS_SCRUB_TYPE_DIR] = { XHG_INO, XFS_SICK_INO_DIR }, + [XFS_SCRUB_TYPE_XATTR] = { XHG_INO, XFS_SICK_INO_XATTR }, + [XFS_SCRUB_TYPE_SYMLINK] = { XHG_INO, XFS_SICK_INO_SYMLINK }, + [XFS_SCRUB_TYPE_PARENT] = { XHG_INO, XFS_SICK_INO_PARENT }, + [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RT, XFS_SICK_RT_BITMAP }, + [XFS_SCRUB_TYPE_RTSUM] = { XHG_RT, XFS_SICK_RT_SUMMARY }, + [XFS_SCRUB_TYPE_UQUOTA] = { XHG_FS, XFS_SICK_FS_UQUOTA }, + [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA }, + [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA }, + [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS }, +}; + +/* Return the health status mask for this scrub type. */ +unsigned int +xchk_health_mask_for_scrub_type( + __u32 scrub_type) +{ + return type_to_health_flag[scrub_type].sick_mask; +} + +/* + * Update filesystem health assessments based on what we found and did. + * + * If the scrubber finds errors, we mark sick whatever's mentioned in + * sick_mask, no matter whether this is a first scan or an + * evaluation of repair effectiveness. + * + * Otherwise, no direct corruption was found, so mark whatever's in + * sick_mask as healthy. + */ +void +xchk_update_health( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag; + bool bad; + + if (!sc->sick_mask) + return; + + bad = (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT); + switch (type_to_health_flag[sc->sm->sm_type].group) { + case XHG_AG: + pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); + if (bad) + xfs_ag_mark_sick(pag, sc->sick_mask); + else + xfs_ag_mark_healthy(pag, sc->sick_mask); + xfs_perag_put(pag); + break; + case XHG_INO: + if (!sc->ip) + return; + if (bad) + xfs_inode_mark_sick(sc->ip, sc->sick_mask); + else + xfs_inode_mark_healthy(sc->ip, sc->sick_mask); + break; + case XHG_FS: + if (bad) + xfs_fs_mark_sick(sc->mp, sc->sick_mask); + else + xfs_fs_mark_healthy(sc->mp, sc->sick_mask); + break; + case XHG_RT: + if (bad) + xfs_rt_mark_sick(sc->mp, sc->sick_mask); + else + xfs_rt_mark_healthy(sc->mp, sc->sick_mask); + break; + default: + ASSERT(0); + break; + } +} + +/* Is the given per-AG btree healthy enough for scanning? */ +bool +xchk_ag_btree_healthy_enough( + struct xfs_scrub *sc, + struct xfs_perag *pag, + xfs_btnum_t btnum) +{ + unsigned int mask = 0; + + /* + * We always want the cursor if it's the same type as whatever we're + * scrubbing, even if we already know the structure is corrupt. + * + * Otherwise, we're only interested in the btree for cross-referencing. + * If we know the btree is bad then don't bother, just set XFAIL. + */ + switch (btnum) { + case XFS_BTNUM_BNO: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT) + return true; + mask = XFS_SICK_AG_BNOBT; + break; + case XFS_BTNUM_CNT: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT) + return true; + mask = XFS_SICK_AG_CNTBT; + break; + case XFS_BTNUM_INO: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) + return true; + mask = XFS_SICK_AG_INOBT; + break; + case XFS_BTNUM_FINO: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT) + return true; + mask = XFS_SICK_AG_FINOBT; + break; + case XFS_BTNUM_RMAP: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT) + return true; + mask = XFS_SICK_AG_RMAPBT; + break; + case XFS_BTNUM_REFC: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT) + return true; + mask = XFS_SICK_AG_REFCNTBT; + break; + default: + ASSERT(0); + return true; + } + + if (xfs_ag_has_sickness(pag, mask)) { + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; + return false; + } + + return true; +} diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h new file mode 100644 index 000000000000..d0b938d3d028 --- /dev/null +++ b/fs/xfs/scrub/health.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_SCRUB_HEALTH_H__ +#define __XFS_SCRUB_HEALTH_H__ + +unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type); +void xchk_update_health(struct xfs_scrub *sc); +bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, + xfs_btnum_t btnum); + +#endif /* __XFS_SCRUB_HEALTH_H__ */ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 700114f79a7d..693eb51f5efb 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -39,7 +39,7 @@ xchk_setup_ag_iallocbt( struct xfs_scrub *sc, struct xfs_inode *ip) { - return xchk_setup_ag_btree(sc, ip, sc->try_harder); + return xchk_setup_ag_btree(sc, ip, sc->flags & XCHK_TRY_HARDER); } /* Inode btree scrubber. */ @@ -185,7 +185,7 @@ xchk_iallocbt_check_cluster_ifree( if (error == -ENODATA) { /* Not cached, just read the disk buffer */ freemask_ok = irec_free ^ !!(dip->di_mode); - if (!bs->sc->try_harder && !freemask_ok) + if (!(bs->sc->flags & XCHK_TRY_HARDER) && !freemask_ok) return -EDEADLOCK; } else if (error < 0) { /* diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 1c9d7c7f64f5..d5d197f1b80f 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -320,7 +320,7 @@ out: * If we failed to lock the parent inode even after a retry, just mark * this scrub incomplete and return. */ - if (sc->try_harder && error == -EDEADLOCK) { + if ((sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { error = 0; xchk_set_incomplete(sc); } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 782d582d3edd..5dfe2b5924db 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -60,7 +60,7 @@ xchk_setup_quota( dqtype = xchk_quota_to_dqtype(sc); if (dqtype == 0) return -EINVAL; - sc->has_quotaofflock = true; + sc->flags |= XCHK_HAS_QUOTAOFFLOCK; mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index f28f4bad317b..eb358f0f5e0a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -46,8 +46,7 @@ int xrep_attempt( struct xfs_inode *ip, - struct xfs_scrub *sc, - bool *fixed) + struct xfs_scrub *sc) { int error = 0; @@ -66,13 +65,13 @@ xrep_attempt( * scrub so that we can tell userspace if we fixed the problem. */ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; - *fixed = true; + sc->flags |= XREP_ALREADY_FIXED; return -EAGAIN; case -EDEADLOCK: case -EAGAIN: /* Tell the caller to try again having grabbed all the locks. */ - if (!sc->try_harder) { - sc->try_harder = true; + if (!(sc->flags & XCHK_TRY_HARDER)) { + sc->flags |= XCHK_TRY_HARDER; return -EAGAIN; } /* @@ -137,10 +136,16 @@ xrep_roll_ag_trans( if (sc->sa.agfl_bp) xfs_trans_bhold(sc->tp, sc->sa.agfl_bp); - /* Roll the transaction. */ + /* + * Roll the transaction. We still own the buffer and the buffer lock + * regardless of whether or not the roll succeeds. If the roll fails, + * the buffers will be released during teardown on our way out of the + * kernel. If it succeeds, we join them to the new transaction and + * move on. + */ error = xfs_trans_roll(&sc->tp); if (error) - goto out_release; + return error; /* Join AG headers to the new transaction. */ if (sc->sa.agi_bp) @@ -151,21 +156,6 @@ xrep_roll_ag_trans( xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp); return 0; - -out_release: - /* - * Rolling failed, so release the hold on the buffers. The - * buffers will be released during teardown on our way out - * of the kernel. - */ - if (sc->sa.agi_bp) - xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp); - if (sc->sa.agf_bp) - xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp); - if (sc->sa.agfl_bp) - xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp); - - return error; } /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index d990314eb08b..60c61d7052a8 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -15,7 +15,7 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) /* Repair helpers */ -int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc, bool *fixed); +int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, @@ -64,8 +64,7 @@ int xrep_agi(struct xfs_scrub *sc); static inline int xrep_attempt( struct xfs_inode *ip, - struct xfs_scrub *sc, - bool *fixed) + struct xfs_scrub *sc) { return -EOPNOTSUPP; } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 1b2344d00525..f630389ee176 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -40,6 +40,7 @@ #include "scrub/trace.h" #include "scrub/btree.h" #include "scrub/repair.h" +#include "scrub/health.h" /* * Online Scrub and Repair @@ -186,8 +187,12 @@ xchk_teardown( xfs_irele(sc->ip); sc->ip = NULL; } - if (sc->has_quotaofflock) + if (sc->flags & XCHK_REAPING_DISABLED) + xchk_start_reaping(sc); + if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); + sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK; + } if (sc->buf) { kmem_free(sc->buf); sc->buf = NULL; @@ -347,6 +352,12 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .scrub = xchk_quota, .repair = xrep_notsupported, }, + [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */ + .type = ST_FS, + .setup = xchk_setup_fscounters, + .scrub = xchk_fscounters, + .repair = xrep_notsupported, + }, }; /* This isn't a stable feature, warn once per day. */ @@ -466,10 +477,14 @@ xfs_scrub_metadata( struct xfs_inode *ip, struct xfs_scrub_metadata *sm) { - struct xfs_scrub sc; + struct xfs_scrub sc = { + .mp = ip->i_mount, + .sm = sm, + .sa = { + .agno = NULLAGNUMBER, + }, + }; struct xfs_mount *mp = ip->i_mount; - bool try_harder = false; - bool already_fixed = false; int error = 0; BUILD_BUG_ON(sizeof(meta_scrub_ops) != @@ -491,21 +506,17 @@ xfs_scrub_metadata( xchk_experimental_warning(mp); + sc.ops = &meta_scrub_ops[sm->sm_type]; + sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); retry_op: /* Set up for the operation. */ - memset(&sc, 0, sizeof(sc)); - sc.mp = ip->i_mount; - sc.sm = sm; - sc.ops = &meta_scrub_ops[sm->sm_type]; - sc.try_harder = try_harder; - sc.sa.agno = NULLAGNUMBER; error = sc.ops->setup(&sc, ip); if (error) goto out_teardown; /* Scrub for errors. */ error = sc.ops->scrub(&sc); - if (!try_harder && error == -EDEADLOCK) { + if (!(sc.flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { /* * Scrubbers return -EDEADLOCK to mean 'try harder'. * Tear down everything we hold, then set up again with @@ -514,12 +525,15 @@ retry_op: error = xchk_teardown(&sc, ip, 0); if (error) goto out; - try_harder = true; + sc.flags |= XCHK_TRY_HARDER; goto retry_op; } else if (error) goto out_teardown; - if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) { + xchk_update_health(&sc); + + if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + !(sc.flags & XREP_ALREADY_FIXED)) { bool needs_fix; /* Let debug users force us into the repair routines. */ @@ -542,10 +556,13 @@ retry_op: * If it's broken, userspace wants us to fix it, and we haven't * already tried to fix it, then attempt a repair. */ - error = xrep_attempt(ip, &sc, &already_fixed); + error = xrep_attempt(ip, &sc); if (error == -EAGAIN) { - if (sc.try_harder) - try_harder = true; + /* + * Either the repair function succeeded or it couldn't + * get all the resources it needs; either way, we go + * back to the beginning and call the scrub function. + */ error = xchk_teardown(&sc, ip, 0); if (error) { xrep_failure(mp); diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 22f754fba8e5..ad1ceb44a628 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -62,13 +62,27 @@ struct xfs_scrub { struct xfs_inode *ip; void *buf; uint ilock_flags; - bool try_harder; - bool has_quotaofflock; + + /* See the XCHK/XREP state flags below. */ + unsigned int flags; + + /* + * The XFS_SICK_* flags that correspond to the metadata being scrubbed + * or repaired. We will use this mask to update the in-core fs health + * status with whatever we find. + */ + unsigned int sick_mask; /* State tracking for single-AG operations. */ struct xchk_ag sa; }; +/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ +#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ +#define XCHK_HAS_QUOTAOFFLOCK (1 << 1) /* we hold the quotaoff lock */ +#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */ +#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ + /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); int xchk_superblock(struct xfs_scrub *sc); @@ -113,6 +127,7 @@ xchk_quota(struct xfs_scrub *sc) return -ENOENT; } #endif +int xchk_fscounters(struct xfs_scrub *sc); /* cross-referencing helpers */ void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno, @@ -138,4 +153,12 @@ void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) #endif +struct xchk_fscounters { + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; + unsigned long long icount_min; + unsigned long long icount_max; +}; + #endif /* __XFS_SCRUB_SCRUB_H__ */ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3c83e8b3b39c..3362bae28b46 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -50,6 +50,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTSUM); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -75,7 +76,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); { XFS_SCRUB_TYPE_RTSUM, "rtsummary" }, \ { XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \ { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \ - { XFS_SCRUB_TYPE_PQUOTA, "prjquota" } + { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \ + { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" } DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, @@ -223,6 +225,7 @@ DEFINE_EVENT(xchk_block_error_class, name, \ void *ret_ip), \ TP_ARGS(sc, daddr, ret_ip)) +DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_fs_error); DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_error); DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_preen); @@ -590,6 +593,64 @@ TRACE_EVENT(xchk_iallocbt_check_cluster, __entry->cluster_ino) ) +TRACE_EVENT(xchk_fscounters_calc, + TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree, + uint64_t fdblocks, uint64_t delalloc), + TP_ARGS(mp, icount, ifree, fdblocks, delalloc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int64_t, icount_sb) + __field(uint64_t, icount_calculated) + __field(int64_t, ifree_sb) + __field(uint64_t, ifree_calculated) + __field(int64_t, fdblocks_sb) + __field(uint64_t, fdblocks_calculated) + __field(uint64_t, delalloc) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->icount_sb = mp->m_sb.sb_icount; + __entry->icount_calculated = icount; + __entry->ifree_sb = mp->m_sb.sb_ifree; + __entry->ifree_calculated = ifree; + __entry->fdblocks_sb = mp->m_sb.sb_fdblocks; + __entry->fdblocks_calculated = fdblocks; + __entry->delalloc = delalloc; + ), + TP_printk("dev %d:%d icount %lld:%llu ifree %lld::%llu fdblocks %lld::%llu delalloc %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->icount_sb, + __entry->icount_calculated, + __entry->ifree_sb, + __entry->ifree_calculated, + __entry->fdblocks_sb, + __entry->fdblocks_calculated, + __entry->delalloc) +) + +TRACE_EVENT(xchk_fscounters_within_range, + TP_PROTO(struct xfs_mount *mp, uint64_t expected, int64_t curr_value, + int64_t old_value), + TP_ARGS(mp, expected, curr_value, old_value), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint64_t, expected) + __field(int64_t, curr_value) + __field(int64_t, old_value) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->expected = expected; + __entry->curr_value = curr_value; + __entry->old_value = old_value; + ), + TP_printk("dev %d:%d expected %llu curr_value %lld old_value %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->expected, + __entry->curr_value, + __entry->old_value) +) + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3619e9e8d359..a6f0f4761a37 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -98,7 +98,6 @@ xfs_destroy_ioend( for (bio = &ioend->io_inline_bio; bio; bio = next) { struct bio_vec *bvec; - int i; struct bvec_iter_all iter_all; /* @@ -111,7 +110,7 @@ xfs_destroy_ioend( next = bio->bi_private; /* walk each page on bio, ending page IO on them */ - bio_for_each_segment_all(bvec, bio, i, iter_all) + bio_for_each_segment_all(bvec, bio, iter_all) xfs_finish_page_writeback(inode, bvec, error); bio_put(bio); } @@ -234,11 +233,10 @@ xfs_setfilesize_ioend( * IO write completion. */ STATIC void -xfs_end_io( - struct work_struct *work) +xfs_end_ioend( + struct xfs_ioend *ioend) { - struct xfs_ioend *ioend = - container_of(work, struct xfs_ioend, io_work); + struct list_head ioend_list; struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; @@ -275,7 +273,116 @@ xfs_end_io( done: if (ioend->io_append_trans) error = xfs_setfilesize_ioend(ioend, error); + list_replace_init(&ioend->io_list, &ioend_list); xfs_destroy_ioend(ioend, error); + + while (!list_empty(&ioend_list)) { + ioend = list_first_entry(&ioend_list, struct xfs_ioend, + io_list); + list_del_init(&ioend->io_list); + xfs_destroy_ioend(ioend, error); + } +} + +/* + * We can merge two adjacent ioends if they have the same set of work to do. + */ +static bool +xfs_ioend_can_merge( + struct xfs_ioend *ioend, + int ioend_error, + struct xfs_ioend *next) +{ + int next_error; + + next_error = blk_status_to_errno(next->io_bio->bi_status); + if (ioend_error != next_error) + return false; + if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK)) + return false; + if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^ + (next->io_state == XFS_EXT_UNWRITTEN)) + return false; + if (ioend->io_offset + ioend->io_size != next->io_offset) + return false; + if (xfs_ioend_is_append(ioend) != xfs_ioend_is_append(next)) + return false; + return true; +} + +/* Try to merge adjacent completions. */ +STATIC void +xfs_ioend_try_merge( + struct xfs_ioend *ioend, + struct list_head *more_ioends) +{ + struct xfs_ioend *next_ioend; + int ioend_error; + int error; + + if (list_empty(more_ioends)) + return; + + ioend_error = blk_status_to_errno(ioend->io_bio->bi_status); + + while (!list_empty(more_ioends)) { + next_ioend = list_first_entry(more_ioends, struct xfs_ioend, + io_list); + if (!xfs_ioend_can_merge(ioend, ioend_error, next_ioend)) + break; + list_move_tail(&next_ioend->io_list, &ioend->io_list); + ioend->io_size += next_ioend->io_size; + if (ioend->io_append_trans) { + error = xfs_setfilesize_ioend(next_ioend, 1); + ASSERT(error == 1); + } + } +} + +/* list_sort compare function for ioends */ +static int +xfs_ioend_compare( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_ioend *ia; + struct xfs_ioend *ib; + + ia = container_of(a, struct xfs_ioend, io_list); + ib = container_of(b, struct xfs_ioend, io_list); + if (ia->io_offset < ib->io_offset) + return -1; + else if (ia->io_offset > ib->io_offset) + return 1; + return 0; +} + +/* Finish all pending io completions. */ +void +xfs_end_io( + struct work_struct *work) +{ + struct xfs_inode *ip; + struct xfs_ioend *ioend; + struct list_head completion_list; + unsigned long flags; + + ip = container_of(work, struct xfs_inode, i_ioend_work); + + spin_lock_irqsave(&ip->i_ioend_lock, flags); + list_replace_init(&ip->i_ioend_list, &completion_list); + spin_unlock_irqrestore(&ip->i_ioend_lock, flags); + + list_sort(NULL, &completion_list, xfs_ioend_compare); + + while (!list_empty(&completion_list)) { + ioend = list_first_entry(&completion_list, struct xfs_ioend, + io_list); + list_del_init(&ioend->io_list); + xfs_ioend_try_merge(ioend, &completion_list); + xfs_end_ioend(ioend); + } } STATIC void @@ -283,14 +390,20 @@ xfs_end_bio( struct bio *bio) { struct xfs_ioend *ioend = bio->bi_private; - struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + unsigned long flags; if (ioend->io_fork == XFS_COW_FORK || - ioend->io_state == XFS_EXT_UNWRITTEN) - queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) - queue_work(mp->m_data_workqueue, &ioend->io_work); - else + ioend->io_state == XFS_EXT_UNWRITTEN || + ioend->io_append_trans != NULL) { + spin_lock_irqsave(&ip->i_ioend_lock, flags); + if (list_empty(&ip->i_ioend_list)) + WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue, + &ip->i_ioend_work)); + list_add_tail(&ioend->io_list, &ip->i_ioend_list); + spin_unlock_irqrestore(&ip->i_ioend_lock, flags); + } else xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } @@ -594,7 +707,6 @@ xfs_alloc_ioend( ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = offset; - INIT_WORK(&ioend->io_work, xfs_end_io); ioend->io_append_trans = NULL; ioend->io_bio = bio; return ioend; diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 6c2615b83c5d..f62b03186c62 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -18,7 +18,6 @@ struct xfs_ioend { struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ - struct work_struct io_work; /* xfsdatad work queue */ struct xfs_trans *io_append_trans;/* xact. for size update */ struct bio *io_bio; /* bio being built */ struct bio io_inline_bio; /* MUST BE LAST! */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2db43ff4f8b5..06d07f1e310b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1193,6 +1193,8 @@ xfs_prepare_shift( * about to shift down every extent from offset to EOF. */ error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip)); + if (error) + return error; /* * Clean out anything hanging around in the cow fork now that diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 010db5f8fb00..65b32acfa0f6 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -605,6 +605,8 @@ xfs_buf_item_unlock( #if defined(DEBUG) || defined(XFS_WARN) bool ordered = bip->bli_flags & XFS_BLI_ORDERED; bool dirty = bip->bli_flags & XFS_BLI_DIRTY; + bool aborted = test_bit(XFS_LI_ABORTED, + &lip->li_flags); #endif trace_xfs_buf_item_unlock(bip); @@ -633,7 +635,7 @@ xfs_buf_item_unlock( released = xfs_buf_item_put(bip); if (hold || (stale && !released)) return; - ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags)); + ASSERT(!stale || aborted); xfs_buf_relse(bp); } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 9ee2a7d02e70..d0df0ed50f4b 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -172,6 +172,8 @@ xfs_ioc_trim( if (copy_from_user(&range, urange, sizeof(range))) return -EFAULT; + range.minlen = max_t(u64, granularity, range.minlen); + minlen = BTOBB(range.minlen); /* * Truncating down the len isn't actually quite correct, but using * BBTOB would mean we trivially get overflows for values @@ -186,7 +188,6 @@ xfs_ioc_trim( start = BTOBB(range.start); end = start + BTOBBT(range.len) - 1; - minlen = BTOBB(max_t(u64, granularity, range.minlen)); if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 87e6dd5326d5..a1af984e4913 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -277,7 +277,8 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) /* * Ensure that the given in-core dquot has a buffer on disk backing it, and - * return the buffer. This is called when the bmapi finds a hole. + * return the buffer locked and held. This is called when the bmapi finds a + * hole. */ STATIC int xfs_dquot_disk_alloc( @@ -355,13 +356,14 @@ xfs_dquot_disk_alloc( * If everything succeeds, the caller of this function is returned a * buffer that is locked and held to the transaction. The caller * is responsible for unlocking any buffer passed back, either - * manually or by committing the transaction. + * manually or by committing the transaction. On error, the buffer is + * released and not passed back. */ xfs_trans_bhold(tp, bp); error = xfs_defer_finish(tpp); - tp = *tpp; if (error) { - xfs_buf_relse(bp); + xfs_trans_bhold_release(*tpp, bp); + xfs_trans_brelse(*tpp, bp); return error; } *bpp = bp; @@ -521,7 +523,6 @@ xfs_qm_dqread_alloc( struct xfs_buf **bpp) { struct xfs_trans *tp; - struct xfs_buf *bp; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, @@ -529,7 +530,7 @@ xfs_qm_dqread_alloc( if (error) goto err; - error = xfs_dquot_disk_alloc(&tp, dqp, &bp); + error = xfs_dquot_disk_alloc(&tp, dqp, bpp); if (error) goto err_cancel; @@ -539,10 +540,10 @@ xfs_qm_dqread_alloc( * Buffer was held to the transaction, so we have to unlock it * manually here because we're not passing it back. */ - xfs_buf_relse(bp); + xfs_buf_relse(*bpp); + *bpp = NULL; goto err; } - *bpp = bp; return 0; err_cancel: diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a7ceae90110e..76748255f843 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -517,6 +517,9 @@ xfs_file_dio_aio_write( } if (iocb->ki_flags & IOCB_NOWAIT) { + /* unaligned dio always waits, bail */ + if (unaligned_io) + return -EAGAIN; if (!xfs_ilock_nowait(ip, iolock)) return -EAGAIN; } else { @@ -536,9 +539,6 @@ xfs_file_dio_aio_write( * xfs_file_aio_write_checks() for other reasons. */ if (unaligned_io) { - /* unaligned dio always waits, bail */ - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; inode_dio_wait(inode); } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 584648582ba7..3d0e0570e3aa 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -289,7 +289,7 @@ xfs_growfs_log( * exported through ioctl XFS_IOC_FSCOUNTS */ -int +void xfs_fs_counts( xfs_mount_t *mp, xfs_fsop_counts_t *cnt) @@ -302,7 +302,6 @@ xfs_fs_counts( spin_lock(&mp->m_sb_lock); cnt->freertx = mp->m_sb.sb_frextents; spin_unlock(&mp->m_sb_lock); - return 0; } /* diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index d023db0862c2..92869f6ec8d3 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -8,7 +8,7 @@ extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in); extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in); -extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); +extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags); diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c new file mode 100644 index 000000000000..4c4929f9e7bf --- /dev/null +++ b/fs/xfs/xfs_health.c @@ -0,0 +1,392 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trace.h" +#include "xfs_health.h" + +/* + * Warn about metadata corruption that we detected but haven't fixed, and + * make sure we're not sitting on anything that would get in the way of + * recovery. + */ +void +xfs_health_unmount( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + unsigned int sick = 0; + unsigned int checked = 0; + bool warn = false; + + if (XFS_FORCED_SHUTDOWN(mp)) + return; + + /* Measure AG corruption levels. */ + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + xfs_ag_measure_sickness(pag, &sick, &checked); + if (sick) { + trace_xfs_ag_unfixed_corruption(mp, agno, sick); + warn = true; + } + xfs_perag_put(pag); + } + + /* Measure realtime volume corruption levels. */ + xfs_rt_measure_sickness(mp, &sick, &checked); + if (sick) { + trace_xfs_rt_unfixed_corruption(mp, sick); + warn = true; + } + + /* + * Measure fs corruption and keep the sample around for the warning. + * See the note below for why we exempt FS_COUNTERS. + */ + xfs_fs_measure_sickness(mp, &sick, &checked); + if (sick & ~XFS_SICK_FS_COUNTERS) { + trace_xfs_fs_unfixed_corruption(mp, sick); + warn = true; + } + + if (warn) { + xfs_warn(mp, +"Uncorrected metadata errors detected; please run xfs_repair."); + + /* + * We discovered uncorrected metadata problems at some point + * during this filesystem mount and have advised the + * administrator to run repair once the unmount completes. + * + * However, we must be careful -- when FSCOUNTERS are flagged + * unhealthy, the unmount procedure omits writing the clean + * unmount record to the log so that the next mount will run + * recovery and recompute the summary counters. In other + * words, we leave a dirty log to get the counters fixed. + * + * Unfortunately, xfs_repair cannot recover dirty logs, so if + * there were filesystem problems, FSCOUNTERS was flagged, and + * the administrator takes our advice to run xfs_repair, + * they'll have to zap the log before repairing structures. + * We don't really want to encourage this, so we mark the + * FSCOUNTERS healthy so that a subsequent repair run won't see + * a dirty log. + */ + if (sick & XFS_SICK_FS_COUNTERS) + xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS); + } +} + +/* Mark unhealthy per-fs metadata. */ +void +xfs_fs_mark_sick( + struct xfs_mount *mp, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY)); + trace_xfs_fs_mark_sick(mp, mask); + + spin_lock(&mp->m_sb_lock); + mp->m_fs_sick |= mask; + mp->m_fs_checked |= mask; + spin_unlock(&mp->m_sb_lock); +} + +/* Mark a per-fs metadata healed. */ +void +xfs_fs_mark_healthy( + struct xfs_mount *mp, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY)); + trace_xfs_fs_mark_healthy(mp, mask); + + spin_lock(&mp->m_sb_lock); + mp->m_fs_sick &= ~mask; + mp->m_fs_checked |= mask; + spin_unlock(&mp->m_sb_lock); +} + +/* Sample which per-fs metadata are unhealthy. */ +void +xfs_fs_measure_sickness( + struct xfs_mount *mp, + unsigned int *sick, + unsigned int *checked) +{ + spin_lock(&mp->m_sb_lock); + *sick = mp->m_fs_sick; + *checked = mp->m_fs_checked; + spin_unlock(&mp->m_sb_lock); +} + +/* Mark unhealthy realtime metadata. */ +void +xfs_rt_mark_sick( + struct xfs_mount *mp, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY)); + trace_xfs_rt_mark_sick(mp, mask); + + spin_lock(&mp->m_sb_lock); + mp->m_rt_sick |= mask; + mp->m_rt_checked |= mask; + spin_unlock(&mp->m_sb_lock); +} + +/* Mark a realtime metadata healed. */ +void +xfs_rt_mark_healthy( + struct xfs_mount *mp, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY)); + trace_xfs_rt_mark_healthy(mp, mask); + + spin_lock(&mp->m_sb_lock); + mp->m_rt_sick &= ~mask; + mp->m_rt_checked |= mask; + spin_unlock(&mp->m_sb_lock); +} + +/* Sample which realtime metadata are unhealthy. */ +void +xfs_rt_measure_sickness( + struct xfs_mount *mp, + unsigned int *sick, + unsigned int *checked) +{ + spin_lock(&mp->m_sb_lock); + *sick = mp->m_rt_sick; + *checked = mp->m_rt_checked; + spin_unlock(&mp->m_sb_lock); +} + +/* Mark unhealthy per-ag metadata. */ +void +xfs_ag_mark_sick( + struct xfs_perag *pag, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY)); + trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask); + + spin_lock(&pag->pag_state_lock); + pag->pag_sick |= mask; + pag->pag_checked |= mask; + spin_unlock(&pag->pag_state_lock); +} + +/* Mark per-ag metadata ok. */ +void +xfs_ag_mark_healthy( + struct xfs_perag *pag, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY)); + trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask); + + spin_lock(&pag->pag_state_lock); + pag->pag_sick &= ~mask; + pag->pag_checked |= mask; + spin_unlock(&pag->pag_state_lock); +} + +/* Sample which per-ag metadata are unhealthy. */ +void +xfs_ag_measure_sickness( + struct xfs_perag *pag, + unsigned int *sick, + unsigned int *checked) +{ + spin_lock(&pag->pag_state_lock); + *sick = pag->pag_sick; + *checked = pag->pag_checked; + spin_unlock(&pag->pag_state_lock); +} + +/* Mark the unhealthy parts of an inode. */ +void +xfs_inode_mark_sick( + struct xfs_inode *ip, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + trace_xfs_inode_mark_sick(ip, mask); + + spin_lock(&ip->i_flags_lock); + ip->i_sick |= mask; + ip->i_checked |= mask; + spin_unlock(&ip->i_flags_lock); +} + +/* Mark parts of an inode healed. */ +void +xfs_inode_mark_healthy( + struct xfs_inode *ip, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + trace_xfs_inode_mark_healthy(ip, mask); + + spin_lock(&ip->i_flags_lock); + ip->i_sick &= ~mask; + ip->i_checked |= mask; + spin_unlock(&ip->i_flags_lock); +} + +/* Sample which parts of an inode are unhealthy. */ +void +xfs_inode_measure_sickness( + struct xfs_inode *ip, + unsigned int *sick, + unsigned int *checked) +{ + spin_lock(&ip->i_flags_lock); + *sick = ip->i_sick; + *checked = ip->i_checked; + spin_unlock(&ip->i_flags_lock); +} + +/* Mappings between internal sick masks and ioctl sick masks. */ + +struct ioctl_sick_map { + unsigned int sick_mask; + unsigned int ioctl_mask; +}; + +static const struct ioctl_sick_map fs_map[] = { + { XFS_SICK_FS_COUNTERS, XFS_FSOP_GEOM_SICK_COUNTERS}, + { XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA }, + { XFS_SICK_FS_GQUOTA, XFS_FSOP_GEOM_SICK_GQUOTA }, + { XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA }, + { 0, 0 }, +}; + +static const struct ioctl_sick_map rt_map[] = { + { XFS_SICK_RT_BITMAP, XFS_FSOP_GEOM_SICK_RT_BITMAP }, + { XFS_SICK_RT_SUMMARY, XFS_FSOP_GEOM_SICK_RT_SUMMARY }, + { 0, 0 }, +}; + +static inline void +xfgeo_health_tick( + struct xfs_fsop_geom *geo, + unsigned int sick, + unsigned int checked, + const struct ioctl_sick_map *m) +{ + if (checked & m->sick_mask) + geo->checked |= m->ioctl_mask; + if (sick & m->sick_mask) + geo->sick |= m->ioctl_mask; +} + +/* Fill out fs geometry health info. */ +void +xfs_fsop_geom_health( + struct xfs_mount *mp, + struct xfs_fsop_geom *geo) +{ + const struct ioctl_sick_map *m; + unsigned int sick; + unsigned int checked; + + geo->sick = 0; + geo->checked = 0; + + xfs_fs_measure_sickness(mp, &sick, &checked); + for (m = fs_map; m->sick_mask; m++) + xfgeo_health_tick(geo, sick, checked, m); + + xfs_rt_measure_sickness(mp, &sick, &checked); + for (m = rt_map; m->sick_mask; m++) + xfgeo_health_tick(geo, sick, checked, m); +} + +static const struct ioctl_sick_map ag_map[] = { + { XFS_SICK_AG_SB, XFS_AG_GEOM_SICK_SB }, + { XFS_SICK_AG_AGF, XFS_AG_GEOM_SICK_AGF }, + { XFS_SICK_AG_AGFL, XFS_AG_GEOM_SICK_AGFL }, + { XFS_SICK_AG_AGI, XFS_AG_GEOM_SICK_AGI }, + { XFS_SICK_AG_BNOBT, XFS_AG_GEOM_SICK_BNOBT }, + { XFS_SICK_AG_CNTBT, XFS_AG_GEOM_SICK_CNTBT }, + { XFS_SICK_AG_INOBT, XFS_AG_GEOM_SICK_INOBT }, + { XFS_SICK_AG_FINOBT, XFS_AG_GEOM_SICK_FINOBT }, + { XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT }, + { XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT }, + { 0, 0 }, +}; + +/* Fill out ag geometry health info. */ +void +xfs_ag_geom_health( + struct xfs_perag *pag, + struct xfs_ag_geometry *ageo) +{ + const struct ioctl_sick_map *m; + unsigned int sick; + unsigned int checked; + + ageo->ag_sick = 0; + ageo->ag_checked = 0; + + xfs_ag_measure_sickness(pag, &sick, &checked); + for (m = ag_map; m->sick_mask; m++) { + if (checked & m->sick_mask) + ageo->ag_checked |= m->ioctl_mask; + if (sick & m->sick_mask) + ageo->ag_sick |= m->ioctl_mask; + } +} + +static const struct ioctl_sick_map ino_map[] = { + { XFS_SICK_INO_CORE, XFS_BS_SICK_INODE }, + { XFS_SICK_INO_BMBTD, XFS_BS_SICK_BMBTD }, + { XFS_SICK_INO_BMBTA, XFS_BS_SICK_BMBTA }, + { XFS_SICK_INO_BMBTC, XFS_BS_SICK_BMBTC }, + { XFS_SICK_INO_DIR, XFS_BS_SICK_DIR }, + { XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR }, + { XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK }, + { XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT }, + { 0, 0 }, +}; + +/* Fill out bulkstat health info. */ +void +xfs_bulkstat_health( + struct xfs_inode *ip, + struct xfs_bstat *bs) +{ + const struct ioctl_sick_map *m; + unsigned int sick; + unsigned int checked; + + bs->bs_sick = 0; + bs->bs_checked = 0; + + xfs_inode_measure_sickness(ip, &sick, &checked); + for (m = ino_map; m->sick_mask; m++) { + if (checked & m->sick_mask) + bs->bs_checked |= m->ioctl_mask; + if (sick & m->sick_mask) + bs->bs_sick |= m->ioctl_mask; + } +} diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 245483cc282b..a76b27565a18 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -70,6 +70,11 @@ xfs_inode_alloc( ip->i_flags = 0; ip->i_delayed_blks = 0; memset(&ip->i_d, 0, sizeof(ip->i_d)); + ip->i_sick = 0; + ip->i_checked = 0; + INIT_WORK(&ip->i_ioend_work, xfs_end_io); + INIT_LIST_HEAD(&ip->i_ioend_list); + spin_lock_init(&ip->i_ioend_lock); return ip; } @@ -446,6 +451,8 @@ xfs_iget_cache_hit( ip->i_flags |= XFS_INEW; xfs_inode_clear_reclaim_tag(pag, ip->i_ino); inode->i_state = I_NEW; + ip->i_sick = 0; + ip->i_checked = 0; ASSERT(!rwsem_is_locked(&inode->i_rwsem)); init_rwsem(&inode->i_rwsem); @@ -1815,7 +1822,7 @@ xfs_inode_clear_cowblocks_tag( /* Disable post-EOF and CoW block auto-reclamation. */ void -xfs_icache_disable_reclaim( +xfs_stop_block_reaping( struct xfs_mount *mp) { cancel_delayed_work_sync(&mp->m_eofblocks_work); @@ -1824,7 +1831,7 @@ xfs_icache_disable_reclaim( /* Enable post-EOF and CoW block auto-reclamation. */ void -xfs_icache_enable_reclaim( +xfs_start_block_reaping( struct xfs_mount *mp) { xfs_queue_eofblocks(mp); diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 26c0626f1f75..48f1fd2bb6ad 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -119,7 +119,7 @@ xfs_fs_eofblocks_from_user( int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); -void xfs_icache_disable_reclaim(struct xfs_mount *mp); -void xfs_icache_enable_reclaim(struct xfs_mount *mp); +void xfs_stop_block_reaping(struct xfs_mount *mp); +void xfs_start_block_reaping(struct xfs_mount *mp); #endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f643a9295179..71d216cf6f87 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1116,7 +1116,7 @@ xfs_droplink( /* * Increment the link count on an inode & log the change. */ -static int +static void xfs_bumplink( xfs_trans_t *tp, xfs_inode_t *ip) @@ -1126,7 +1126,6 @@ xfs_bumplink( ASSERT(ip->i_d.di_version > 1); inc_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return 0; } int @@ -1235,9 +1234,7 @@ xfs_create( if (error) goto out_trans_cancel; - error = xfs_bumplink(tp, dp); - if (error) - goto out_trans_cancel; + xfs_bumplink(tp, dp); } /* @@ -1454,9 +1451,7 @@ xfs_link( xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); - error = xfs_bumplink(tp, sip); - if (error) - goto error_return; + xfs_bumplink(tp, sip); /* * If this is a synchronous mount, make sure that the @@ -3097,9 +3092,7 @@ xfs_cross_rename( error = xfs_droplink(tp, dp2); if (error) goto out_trans_abort; - error = xfs_bumplink(tp, dp1); - if (error) - goto out_trans_abort; + xfs_bumplink(tp, dp1); } /* @@ -3123,9 +3116,7 @@ xfs_cross_rename( error = xfs_droplink(tp, dp1); if (error) goto out_trans_abort; - error = xfs_bumplink(tp, dp2); - if (error) - goto out_trans_abort; + xfs_bumplink(tp, dp2); } /* @@ -3322,9 +3313,7 @@ xfs_rename( XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); if (new_parent && src_is_directory) { - error = xfs_bumplink(tp, target_dp); - if (error) - goto out_trans_cancel; + xfs_bumplink(tp, target_dp); } } else { /* target_ip != NULL */ /* @@ -3443,9 +3432,7 @@ xfs_rename( */ if (wip) { ASSERT(VFS_I(wip)->i_nlink == 0); - error = xfs_bumplink(tp, wip); - if (error) - goto out_trans_cancel; + xfs_bumplink(tp, wip); error = xfs_iunlink_remove(tp, wip); if (error) goto out_trans_cancel; @@ -3614,7 +3601,6 @@ cluster_corrupt_out: * inode buffer and shut down the filesystem. */ rcu_read_unlock(); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); /* * We'll always have an inode attached to the buffer for completion @@ -3624,11 +3610,14 @@ cluster_corrupt_out: * xfs_buf_submit(). */ ASSERT(bp->b_iodone); + bp->b_flags |= XBF_ASYNC; bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); xfs_buf_ioerror(bp, -EIO); xfs_buf_ioend(bp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + /* abort the corrupt inode, as it was not attached to the buffer */ xfs_iflush_abort(cip, false); kmem_free(cilist); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e62074a5257c..558173f95a03 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -45,10 +45,18 @@ typedef struct xfs_inode { mrlock_t i_lock; /* inode lock */ mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ + + /* + * Bitsets of inode metadata that have been checked and/or are sick. + * Callers must hold i_flags_lock before accessing this field. + */ + uint16_t i_checked; + uint16_t i_sick; + spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. */ unsigned long i_flags; /* see defined flags below */ - unsigned int i_delayed_blks; /* count of delay alloc blks */ + uint64_t i_delayed_blks; /* count of delay alloc blks */ struct xfs_icdinode i_d; /* most of ondisk inode */ @@ -57,6 +65,11 @@ typedef struct xfs_inode { /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ + + /* pending io completions */ + spinlock_t i_ioend_lock; + struct work_struct i_ioend_work; + struct list_head i_ioend_list; } xfs_inode_t; /* Convert from vfs inode to xfs inode */ @@ -503,4 +516,6 @@ bool xfs_inode_verify_forks(struct xfs_inode *ip); int xfs_iunlink_init(struct xfs_perag *pag); void xfs_iunlink_destroy(struct xfs_perag *pag); +void xfs_end_io(struct work_struct *work); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6ecdbb3af7de..d7dfc13f30f5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -33,6 +33,8 @@ #include "xfs_fsmap.h" #include "scrub/xfs_scrub.h" #include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_health.h" #include <linux/capability.h> #include <linux/cred.h> @@ -779,40 +781,46 @@ xfs_ioc_bulkstat( } STATIC int -xfs_ioc_fsgeometry_v1( - xfs_mount_t *mp, - void __user *arg) +xfs_ioc_fsgeometry( + struct xfs_mount *mp, + void __user *arg, + int struct_version) { - xfs_fsop_geom_t fsgeo; - int error; + struct xfs_fsop_geom fsgeo; + size_t len; - error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3); - if (error) - return error; + xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version); - /* - * Caller should have passed an argument of type - * xfs_fsop_geom_v1_t. This is a proper subset of the - * xfs_fsop_geom_t that xfs_fs_geometry() fills in. - */ - if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) + if (struct_version <= 3) + len = sizeof(struct xfs_fsop_geom_v1); + else if (struct_version == 4) + len = sizeof(struct xfs_fsop_geom_v4); + else { + xfs_fsop_geom_health(mp, &fsgeo); + len = sizeof(fsgeo); + } + + if (copy_to_user(arg, &fsgeo, len)) return -EFAULT; return 0; } STATIC int -xfs_ioc_fsgeometry( - xfs_mount_t *mp, +xfs_ioc_ag_geometry( + struct xfs_mount *mp, void __user *arg) { - xfs_fsop_geom_t fsgeo; + struct xfs_ag_geometry ageo; int error; - error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 4); + if (copy_from_user(&ageo, arg, sizeof(ageo))) + return -EFAULT; + + error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); if (error) return error; - if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + if (copy_to_user(arg, &ageo, sizeof(ageo))) return -EFAULT; return 0; } @@ -1142,7 +1150,7 @@ xfs_ioctl_setattr_get_trans( error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) - return ERR_PTR(error); + goto out_unlock; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags); @@ -1937,10 +1945,14 @@ xfs_file_ioctl( return xfs_ioc_bulkstat(mp, cmd, arg); case XFS_IOC_FSGEOMETRY_V1: - return xfs_ioc_fsgeometry_v1(mp, arg); - + return xfs_ioc_fsgeometry(mp, arg, 3); + case XFS_IOC_FSGEOMETRY_V4: + return xfs_ioc_fsgeometry(mp, arg, 4); case XFS_IOC_FSGEOMETRY: - return xfs_ioc_fsgeometry(mp, arg); + return xfs_ioc_fsgeometry(mp, arg, 5); + + case XFS_IOC_AG_GEOMETRY: + return xfs_ioc_ag_geometry(mp, arg); case XFS_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *)arg); @@ -2031,9 +2043,7 @@ xfs_file_ioctl( case XFS_IOC_FSCOUNTS: { xfs_fsop_counts_t out; - error = xfs_fs_counts(mp, &out); - if (error) - return error; + xfs_fs_counts(mp, &out); if (copy_to_user(arg, &out, sizeof(out))) return -EFAULT; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 5001dca361e9..614fc6886d24 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -52,12 +52,9 @@ xfs_compat_ioc_fsgeometry_v1( struct xfs_mount *mp, compat_xfs_fsop_geom_v1_t __user *arg32) { - xfs_fsop_geom_t fsgeo; - int error; + struct xfs_fsop_geom fsgeo; - error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3); - if (error) - return error; + xfs_fs_geometry(&mp->m_sb, &fsgeo, 3); /* The 32-bit variant simply has some padding at the end */ if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1))) return -EFAULT; @@ -561,7 +558,9 @@ xfs_file_compat_ioctl( switch (cmd) { /* No size or alignment issues on any arch */ case XFS_IOC_DIOINFO: + case XFS_IOC_FSGEOMETRY_V4: case XFS_IOC_FSGEOMETRY: + case XFS_IOC_AG_GEOMETRY: case XFS_IOC_FSGETXATTR: case XFS_IOC_FSSETXATTR: case XFS_IOC_FSGETXATTRA: diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 942e4aa5e729..1e1a0af1dd34 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -18,6 +18,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_health.h" /* * Return stat information for one inode. @@ -84,6 +85,7 @@ xfs_bulkstat_one_int( buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; buf->bs_extents = dic->di_nextents; memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); + xfs_bulkstat_health(ip, buf); buf->bs_dmevmask = dic->di_dmevmask; buf->bs_dmstate = dic->di_dmstate; buf->bs_aextents = dic->di_anextents; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c3b610b687d1..457ced3ee3e1 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -23,6 +23,7 @@ #include "xfs_cksum.h" #include "xfs_sysfs.h" #include "xfs_sb.h" +#include "xfs_health.h" kmem_zone_t *xfs_log_ticket_zone; @@ -861,7 +862,7 @@ xfs_log_write_unmount_record( * recalculated during log recovery at next mount. Refer to * xlog_check_unmount_rec for more details. */ - if (XFS_TEST_ERROR((mp->m_flags & XFS_MOUNT_BAD_SUMMARY), mp, + if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { xfs_alert(mp, "%s: will fix summary counters at next mount", __func__); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index d3884e08b43c..5e595948bc5a 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -582,6 +582,19 @@ xlog_cil_committed( struct xfs_cil_ctx *ctx = args; struct xfs_mount *mp = ctx->cil->xc_log->l_mp; + /* + * If the I/O failed, we're aborting the commit and already shutdown. + * Wake any commit waiters before aborting the log items so we don't + * block async log pushers on callbacks. Async log pushers explicitly do + * not wait on log force completion because they may be holding locks + * required to unpin items. + */ + if (abort) { + spin_lock(&ctx->cil->xc_push_lock); + wake_up_all(&ctx->cil->xc_commit_wait); + spin_unlock(&ctx->cil->xc_push_lock); + } + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, ctx->start_lsn, abort); @@ -589,15 +602,7 @@ xlog_cil_committed( xfs_extent_busy_clear(mp, &ctx->busy_extents, (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); - /* - * If we are aborting the commit, wake up anyone waiting on the - * committing list. If we don't, then a shutdown we can leave processes - * waiting in xlog_cil_force_lsn() waiting on a sequence commit that - * will never happen because we aborted it. - */ spin_lock(&ctx->cil->xc_push_lock); - if (abort) - wake_up_all(&ctx->cil->xc_commit_wait); list_del(&ctx->committing); spin_unlock(&ctx->cil->xc_push_lock); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 3371d1ff27c4..9329f5adbfbe 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -5167,7 +5167,7 @@ xlog_recover_process_iunlinks( } } -STATIC int +STATIC void xlog_unpack_data( struct xlog_rec_header *rhead, char *dp, @@ -5190,8 +5190,6 @@ xlog_unpack_data( dp += BBSIZE; } } - - return 0; } /* @@ -5206,11 +5204,9 @@ xlog_recover_process( int pass, struct list_head *buffer_list) { - int error; __le32 old_crc = rhead->h_crc; __le32 crc; - crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); /* @@ -5249,9 +5245,7 @@ xlog_recover_process( return -EFSCORRUPTED; } - error = xlog_unpack_data(rhead, dp, log); - if (error) - return error; + xlog_unpack_data(rhead, dp, log); return xlog_recover_process_data(log, rhash, rhead, dp, pass, buffer_list); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fd63b0b1307c..6b2bfe81dc51 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -34,6 +34,7 @@ #include "xfs_refcount_btree.h" #include "xfs_reflink.h" #include "xfs_extent_busy.h" +#include "xfs_health.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -231,6 +232,7 @@ xfs_initialize_perag( error = xfs_iunlink_init(pag); if (error) goto out_hash_destroy; + spin_lock_init(&pag->pag_state_lock); } index = xfs_set_inode_alloc(mp, agcount); @@ -644,7 +646,7 @@ xfs_check_summary_counts( (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks || !xfs_verify_icount(mp, mp->m_sb.sb_icount) || mp->m_sb.sb_ifree > mp->m_sb.sb_icount)) - mp->m_flags |= XFS_MOUNT_BAD_SUMMARY; + xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); /* * We can safely re-initialise incore superblock counters from the @@ -659,7 +661,7 @@ xfs_check_summary_counts( */ if ((!xfs_sb_version_haslazysbcount(&mp->m_sb) || XFS_LAST_UNMOUNT_WAS_CLEAN(mp)) && - !(mp->m_flags & XFS_MOUNT_BAD_SUMMARY)) + !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) return 0; return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); @@ -1068,6 +1070,7 @@ xfs_mountfs( */ cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); + xfs_health_unmount(mp); out_log_dealloc: mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); @@ -1104,7 +1107,7 @@ xfs_unmountfs( uint64_t resblks; int error; - xfs_icache_disable_reclaim(mp); + xfs_stop_block_reaping(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); @@ -1150,6 +1153,7 @@ xfs_unmountfs( */ cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); + xfs_health_unmount(mp); xfs_qm_unmount(mp); @@ -1445,7 +1449,26 @@ xfs_force_summary_recalc( if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) return; - spin_lock(&mp->m_sb_lock); - mp->m_flags |= XFS_MOUNT_BAD_SUMMARY; - spin_unlock(&mp->m_sb_lock); + xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); +} + +/* + * Update the in-core delayed block counter. + * + * We prefer to update the counter without having to take a spinlock for every + * counter update (i.e. batching). Each change to delayed allocation + * reservations can change can easily exceed the default percpu counter + * batching, so we use a larger batch factor here. + * + * Note that we don't currently have any callers requiring fast summation + * (e.g. percpu_counter_read) so we can use a big batch value here. + */ +#define XFS_DELALLOC_BATCH (4096) +void +xfs_mod_delalloc( + struct xfs_mount *mp, + int64_t delta) +{ + percpu_counter_add_batch(&mp->m_delalloc_blks, delta, + XFS_DELALLOC_BATCH); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 110f927cf943..c81a5cd7c228 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -60,6 +60,20 @@ struct xfs_error_cfg { typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ + + /* + * Bitsets of per-fs metadata that have been checked and/or are sick. + * Callers must hold m_sb_lock to access these two fields. + */ + uint8_t m_fs_checked; + uint8_t m_fs_sick; + /* + * Bitsets of rt metadata that have been checked and/or are sick. + * Callers must hold m_sb_lock to access this field. + */ + uint8_t m_rt_checked; + uint8_t m_rt_sick; + struct xfs_ail *m_ail; /* fs active log item list */ struct xfs_sb m_sb; /* copy of fs superblock */ @@ -67,6 +81,12 @@ typedef struct xfs_mount { struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ struct percpu_counter m_fdblocks; /* free block counter */ + /* + * Count of data device blocks reserved for delayed allocations, + * including indlen blocks. Does not include allocated CoW staging + * extents or anything related to the rt device. + */ + struct percpu_counter m_delalloc_blks; struct xfs_buf *m_sb_bp; /* buffer for superblock */ char *m_fsname; /* filesystem name */ @@ -175,7 +195,6 @@ typedef struct xfs_mount { struct xstats m_stats; /* per-fs stats */ struct workqueue_struct *m_buf_workqueue; - struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; @@ -214,7 +233,6 @@ typedef struct xfs_mount { must be synchronous except for space allocations */ #define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */ -#define XFS_MOUNT_BAD_SUMMARY (1ULL << 2) /* summary counters are bad */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for @@ -369,6 +387,15 @@ typedef struct xfs_perag { xfs_agino_t pagl_pagino; xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; + + /* + * Bitsets of per-ag metadata that have been checked and/or are sick. + * Callers should hold pag_state_lock before accessing this field. + */ + uint16_t pag_checked; + uint16_t pag_sick; + spinlock_t pag_state_lock; + spinlock_t pagb_lock; /* lock for pagb_tree */ struct rb_root pagb_tree; /* ordered tree of busy extents */ unsigned int pagb_gen; /* generation count for pagb_tree */ @@ -454,5 +481,6 @@ int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb, struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp, int error_class, int error); void xfs_force_summary_recalc(struct xfs_mount *mp); +void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta); #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 52ed7904df10..aa6b6db3db0e 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1812,7 +1812,8 @@ xfs_qm_vop_chown_reserve( uint flags) { struct xfs_mount *mp = ip->i_mount; - uint delblks, blkflags, prjflags = 0; + uint64_t delblks; + unsigned int blkflags, prjflags = 0; struct xfs_dquot *udq_unres = NULL; struct xfs_dquot *gdq_unres = NULL; struct xfs_dquot *pdq_unres = NULL; diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 3ccf0fbc9071..b41b75089548 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -113,12 +113,8 @@ xfs_quota_inode(xfs_mount_t *mp, uint dq_flags) return NULL; } -extern void xfs_trans_mod_dquot(struct xfs_trans *, - struct xfs_dquot *, uint, long); -extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, - struct xfs_mount *, struct xfs_dquot *, - struct xfs_dquot *, struct xfs_dquot *, - long, long, uint); +extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp, + uint field, int64_t delta); extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *); diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 55b798265ef7..efe42ae7a2f3 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -56,32 +56,35 @@ xfs_quota_chkd_flag( * The structure kept inside the xfs_trans_t keep track of dquot changes * within a transaction and apply them later. */ -typedef struct xfs_dqtrx { +struct xfs_dqtrx { struct xfs_dquot *qt_dquot; /* the dquot this refers to */ - ulong qt_blk_res; /* blks reserved on a dquot */ - ulong qt_ino_res; /* inode reserved on a dquot */ - ulong qt_ino_res_used; /* inodes used from the reservation */ - long qt_bcount_delta; /* dquot blk count changes */ - long qt_delbcnt_delta; /* delayed dquot blk count changes */ - long qt_icount_delta; /* dquot inode count changes */ - ulong qt_rtblk_res; /* # blks reserved on a dquot */ - ulong qt_rtblk_res_used;/* # blks used from reservation */ - long qt_rtbcount_delta;/* dquot realtime blk changes */ - long qt_delrtb_delta; /* delayed RT blk count changes */ -} xfs_dqtrx_t; + + uint64_t qt_blk_res; /* blks reserved on a dquot */ + int64_t qt_bcount_delta; /* dquot blk count changes */ + int64_t qt_delbcnt_delta; /* delayed dquot blk count changes */ + + uint64_t qt_rtblk_res; /* # blks reserved on a dquot */ + uint64_t qt_rtblk_res_used;/* # blks used from reservation */ + int64_t qt_rtbcount_delta;/* dquot realtime blk changes */ + int64_t qt_delrtb_delta; /* delayed RT blk count changes */ + + uint64_t qt_ino_res; /* inode reserved on a dquot */ + uint64_t qt_ino_res_used; /* inodes used from the reservation */ + int64_t qt_icount_delta; /* dquot inode count changes */ +}; #ifdef CONFIG_XFS_QUOTA extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *); extern void xfs_trans_free_dqinfo(struct xfs_trans *); extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, - uint, long); + uint, int64_t); extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, - struct xfs_inode *, long, long, uint); + struct xfs_inode *, int64_t, long, uint); extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, - struct xfs_dquot *, struct xfs_dquot *, long, long, uint); + struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint); extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t, prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, @@ -121,14 +124,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, - struct xfs_inode *ip, long nblks, long ninos, uint flags) + struct xfs_inode *ip, int64_t nblks, long ninos, uint flags) { return 0; } static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, struct xfs_mount *mp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, - long nblks, long nions, uint flags) + int64_t nblks, long nions, uint flags) { return 0; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f093ea244849..a14d11d78bd8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -66,7 +66,7 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ enum { Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize, Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, - Opt_mtpt, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, + Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep, Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, @@ -87,7 +87,6 @@ static const match_table_t tokens = { {Opt_sunit, "sunit=%u"}, /* data volume stripe unit */ {Opt_swidth, "swidth=%u"}, /* data volume stripe width */ {Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */ - {Opt_mtpt, "mtpt"}, /* filesystem mount point */ {Opt_grpid, "grpid"}, /* group-ID from parent directory */ {Opt_nogrpid, "nogrpid"}, /* group-ID from current process */ {Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */ @@ -236,9 +235,6 @@ xfs_parseargs( if (!mp->m_logname) return -ENOMEM; break; - case Opt_mtpt: - xfs_warn(mp, "%s option not allowed on this system", p); - return -EINVAL; case Opt_rtdev: kfree(mp->m_rtname); mp->m_rtname = match_strdup(args); @@ -448,7 +444,7 @@ struct proc_xfs_info { char *str; }; -STATIC int +STATIC void xfs_showargs( struct xfs_mount *mp, struct seq_file *m) @@ -527,9 +523,8 @@ xfs_showargs( if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); - - return 0; } + static uint64_t xfs_max_file_offset( unsigned int blockshift) @@ -539,26 +534,18 @@ xfs_max_file_offset( /* Figure out maximum filesize, on Linux this can depend on * the filesystem blocksize (on 32 bit platforms). - * __block_write_begin does this in an [unsigned] long... + * __block_write_begin does this in an [unsigned] long long... * page->index << (PAGE_SHIFT - bbits) * So, for page sized blocks (4K on 32 bit platforms), * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) * but for smaller blocksizes it is less (bbits = log2 bsize). - * Note1: get_block_t takes a long (implicit cast from above) - * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch - * can optionally convert the [unsigned] long from above into - * an [unsigned] long long. */ #if BITS_PER_LONG == 32 -# if defined(CONFIG_LBDAF) ASSERT(sizeof(sector_t) == 8); pagefactor = PAGE_SIZE; bitshift = BITS_PER_LONG; -# else - pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift); -# endif #endif return (((uint64_t)pagefactor) << bitshift) - 1; @@ -838,15 +825,10 @@ xfs_init_mount_workqueues( if (!mp->m_buf_workqueue) goto out; - mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); - if (!mp->m_data_workqueue) - goto out_destroy_buf; - mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_unwritten_workqueue) - goto out_destroy_data_iodone_queue; + goto out_destroy_buf; mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); @@ -886,8 +868,6 @@ out_destroy_cil: destroy_workqueue(mp->m_cil_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); -out_destroy_data_iodone_queue: - destroy_workqueue(mp->m_data_workqueue); out_destroy_buf: destroy_workqueue(mp->m_buf_workqueue); out: @@ -903,7 +883,6 @@ xfs_destroy_mount_workqueues( destroy_workqueue(mp->m_log_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); - destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); destroy_workqueue(mp->m_buf_workqueue); } @@ -1376,7 +1355,7 @@ xfs_fs_remount( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } - xfs_icache_enable_reclaim(mp); + xfs_start_block_reaping(mp); /* Create the per-AG metadata reservation pool .*/ error = xfs_fs_reserve_ag_blocks(mp); @@ -1390,7 +1369,7 @@ xfs_fs_remount( * Cancel background eofb scanning so it cannot race with the * final log force+buftarg wait and deadlock the remount. */ - xfs_icache_disable_reclaim(mp); + xfs_stop_block_reaping(mp); /* Get rid of any leftover CoW reservations... */ error = xfs_icache_free_cowblocks(mp, NULL); @@ -1434,7 +1413,7 @@ xfs_fs_freeze( { struct xfs_mount *mp = XFS_M(sb); - xfs_icache_disable_reclaim(mp); + xfs_stop_block_reaping(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); return xfs_sync_sb(mp, true); @@ -1448,7 +1427,7 @@ xfs_fs_unfreeze( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - xfs_icache_enable_reclaim(mp); + xfs_start_block_reaping(mp); return 0; } @@ -1457,7 +1436,8 @@ xfs_fs_show_options( struct seq_file *m, struct dentry *root) { - return xfs_showargs(XFS_M(root->d_sb), m); + xfs_showargs(XFS_M(root->d_sb), m); + return 0; } /* @@ -1546,8 +1526,14 @@ xfs_init_percpu_counters( if (error) goto free_ifree; + error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL); + if (error) + goto free_fdblocks; + return 0; +free_fdblocks: + percpu_counter_destroy(&mp->m_fdblocks); free_ifree: percpu_counter_destroy(&mp->m_ifree); free_icount: @@ -1571,6 +1557,9 @@ xfs_destroy_percpu_counters( percpu_counter_destroy(&mp->m_icount); percpu_counter_destroy(&mp->m_ifree); percpu_counter_destroy(&mp->m_fdblocks); + ASSERT(XFS_FORCED_SHUTDOWN(mp) || + percpu_counter_sum(&mp->m_delalloc_blks) == 0); + percpu_counter_destroy(&mp->m_delalloc_blks); } static struct xfs_mount * diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 47fb07d86efd..2464ea351f83 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3440,6 +3440,82 @@ DEFINE_AGINODE_EVENT(xfs_iunlink); DEFINE_AGINODE_EVENT(xfs_iunlink_remove); DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback); +DECLARE_EVENT_CLASS(xfs_fs_corrupt_class, + TP_PROTO(struct xfs_mount *mp, unsigned int flags), + TP_ARGS(mp, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->flags = flags; + ), + TP_printk("dev %d:%d flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->flags) +); +#define DEFINE_FS_CORRUPT_EVENT(name) \ +DEFINE_EVENT(xfs_fs_corrupt_class, name, \ + TP_PROTO(struct xfs_mount *mp, unsigned int flags), \ + TP_ARGS(mp, flags)) +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick); +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy); +DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption); +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick); +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy); +DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption); + +DECLARE_EVENT_CLASS(xfs_ag_corrupt_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags), + TP_ARGS(mp, agno, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->flags = flags; + ), + TP_printk("dev %d:%d agno %u flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, __entry->flags) +); +#define DEFINE_AG_CORRUPT_EVENT(name) \ +DEFINE_EVENT(xfs_ag_corrupt_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + unsigned int flags), \ + TP_ARGS(mp, agno, flags)) +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick); +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy); +DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption); + +DECLARE_EVENT_CLASS(xfs_inode_corrupt_class, + TP_PROTO(struct xfs_inode *ip, unsigned int flags), + TP_ARGS(ip, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, __entry->flags) +); +#define DEFINE_INODE_CORRUPT_EVENT(name) \ +DEFINE_EVENT(xfs_inode_corrupt_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned int flags), \ + TP_ARGS(ip, flags)) +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick); +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index c23257a26c2b..cd664a03613f 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -74,13 +74,13 @@ xfs_trans_log_dquot( */ void xfs_trans_dup_dqinfo( - xfs_trans_t *otp, - xfs_trans_t *ntp) + struct xfs_trans *otp, + struct xfs_trans *ntp) { - xfs_dqtrx_t *oq, *nq; - int i, j; - xfs_dqtrx_t *oqa, *nqa; - ulong blk_res_used; + struct xfs_dqtrx *oq, *nq; + int i, j; + struct xfs_dqtrx *oqa, *nqa; + uint64_t blk_res_used; if (!otp->t_dqinfo) return; @@ -137,7 +137,7 @@ xfs_trans_mod_dquot_byino( xfs_trans_t *tp, xfs_inode_t *ip, uint field, - long delta) + int64_t delta) { xfs_mount_t *mp = tp->t_mountp; @@ -191,12 +191,12 @@ xfs_trans_get_dqtrx( */ void xfs_trans_mod_dquot( - xfs_trans_t *tp, - xfs_dquot_t *dqp, - uint field, - long delta) + struct xfs_trans *tp, + struct xfs_dquot *dqp, + uint field, + int64_t delta) { - xfs_dqtrx_t *qtrx; + struct xfs_dqtrx *qtrx; ASSERT(tp); ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); @@ -219,14 +219,14 @@ xfs_trans_mod_dquot( * regular disk blk reservation */ case XFS_TRANS_DQ_RES_BLKS: - qtrx->qt_blk_res += (ulong)delta; + qtrx->qt_blk_res += delta; break; /* * inode reservation */ case XFS_TRANS_DQ_RES_INOS: - qtrx->qt_ino_res += (ulong)delta; + qtrx->qt_ino_res += delta; break; /* @@ -245,7 +245,7 @@ xfs_trans_mod_dquot( */ case XFS_TRANS_DQ_ICOUNT: if (qtrx->qt_ino_res && delta > 0) { - qtrx->qt_ino_res_used += (ulong)delta; + qtrx->qt_ino_res_used += delta; ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used); } qtrx->qt_icount_delta += delta; @@ -255,7 +255,7 @@ xfs_trans_mod_dquot( * rtblk reservation */ case XFS_TRANS_DQ_RES_RTBLKS: - qtrx->qt_rtblk_res += (ulong)delta; + qtrx->qt_rtblk_res += delta; break; /* @@ -263,7 +263,7 @@ xfs_trans_mod_dquot( */ case XFS_TRANS_DQ_RTBCOUNT: if (qtrx->qt_rtblk_res && delta > 0) { - qtrx->qt_rtblk_res_used += (ulong)delta; + qtrx->qt_rtblk_res_used += delta; ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used); } qtrx->qt_rtbcount_delta += delta; @@ -288,8 +288,8 @@ xfs_trans_mod_dquot( */ STATIC void xfs_trans_dqlockedjoin( - xfs_trans_t *tp, - xfs_dqtrx_t *q) + struct xfs_trans *tp, + struct xfs_dqtrx *q) { ASSERT(q[0].qt_dquot != NULL); if (q[1].qt_dquot == NULL) { @@ -320,8 +320,8 @@ xfs_trans_apply_dquot_deltas( struct xfs_dquot *dqp; struct xfs_dqtrx *qtrx, *qa; struct xfs_disk_dquot *d; - long totalbdelta; - long totalrtbdelta; + int64_t totalbdelta; + int64_t totalrtbdelta; if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY)) return; @@ -413,7 +413,7 @@ xfs_trans_apply_dquot_deltas( * reservation that a transaction structure knows of. */ if (qtrx->qt_blk_res != 0) { - ulong blk_res_used = 0; + uint64_t blk_res_used = 0; if (qtrx->qt_bcount_delta > 0) blk_res_used = qtrx->qt_bcount_delta; @@ -501,7 +501,7 @@ xfs_trans_unreserve_and_mod_dquots( { int i, j; xfs_dquot_t *dqp; - xfs_dqtrx_t *qtrx, *qa; + struct xfs_dqtrx *qtrx, *qa; bool locked; if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) @@ -585,7 +585,7 @@ xfs_trans_dqresv( xfs_trans_t *tp, xfs_mount_t *mp, xfs_dquot_t *dqp, - long nblks, + int64_t nblks, long ninos, uint flags) { @@ -745,7 +745,7 @@ xfs_trans_reserve_quota_bydquots( struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, - long nblks, + int64_t nblks, long ninos, uint flags) { @@ -804,7 +804,7 @@ int xfs_trans_reserve_quota_nblks( struct xfs_trans *tp, struct xfs_inode *ip, - long nblks, + int64_t nblks, long ninos, uint flags) { |