diff options
Diffstat (limited to 'fs')
384 files changed, 9339 insertions, 6433 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 64c58eb26159..9eb34701a566 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -55,42 +55,27 @@ int v9fs_random_cachetag(struct v9fs_session_info *v9ses) return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies); } -static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - struct v9fs_session_info *v9ses; - uint16_t klen = 0; - - v9ses = (struct v9fs_session_info *)cookie_netfs_data; - p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n", - v9ses, buffer, bufmax); - - if (v9ses->cachetag) - klen = strlen(v9ses->cachetag); - - if (klen > bufmax) - return 0; - - memcpy(buffer, v9ses->cachetag, klen); - p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag); - return klen; -} - const struct fscache_cookie_def v9fs_cache_session_index_def = { .name = "9P.session", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = v9fs_cache_session_get_key, }; void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) { /* If no cache session tag was specified, we generate a random one. 
*/ - if (!v9ses->cachetag) - v9fs_random_cachetag(v9ses); + if (!v9ses->cachetag) { + if (v9fs_random_cachetag(v9ses) < 0) { + v9ses->fscache = NULL; + return; + } + } v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, &v9fs_cache_session_index_def, - v9ses, true); + v9ses->cachetag, + strlen(v9ses->cachetag), + NULL, 0, + v9ses, 0, true); p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n", v9ses, v9ses->fscache); } @@ -99,45 +84,15 @@ void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) { p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n", v9ses, v9ses->fscache); - fscache_relinquish_cookie(v9ses->fscache, 0); + fscache_relinquish_cookie(v9ses->fscache, NULL, false); v9ses->fscache = NULL; } - -static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct v9fs_inode *v9inode = cookie_netfs_data; - memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path)); - p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n", - &v9inode->vfs_inode, v9inode->qid.path); - return sizeof(v9inode->qid.path); -} - -static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, - uint64_t *size) -{ - const struct v9fs_inode *v9inode = cookie_netfs_data; - *size = i_size_read(&v9inode->vfs_inode); - - p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n", - &v9inode->vfs_inode, *size); -} - -static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t buflen) -{ - const struct v9fs_inode *v9inode = cookie_netfs_data; - memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version)); - p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n", - &v9inode->vfs_inode, v9inode->qid.version); - return sizeof(v9inode->qid.version); -} - static enum fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, const void *buffer, - uint16_t buflen) + uint16_t buflen, + loff_t object_size) { const struct v9fs_inode *v9inode = cookie_netfs_data; @@ -154,9 +109,6 
@@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, const struct fscache_cookie_def v9fs_cache_inode_index_def = { .name = "9p.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .get_key = v9fs_cache_inode_get_key, - .get_attr = v9fs_cache_inode_get_attr, - .get_aux = v9fs_cache_inode_get_aux, .check_aux = v9fs_cache_inode_check_aux, }; @@ -175,7 +127,13 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) v9ses = v9fs_inode2v9ses(inode); v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - v9inode, true); + &v9inode->qid.path, + sizeof(v9inode->qid.path), + &v9inode->qid.version, + sizeof(v9inode->qid.version), + v9inode, + i_size_read(&v9inode->vfs_inode), + true); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9inode->fscache); @@ -190,7 +148,8 @@ void v9fs_cache_inode_put_cookie(struct inode *inode) p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n", inode, v9inode->fscache); - fscache_relinquish_cookie(v9inode->fscache, 0); + fscache_relinquish_cookie(v9inode->fscache, &v9inode->qid.version, + false); v9inode->fscache = NULL; } @@ -203,7 +162,7 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode) p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n", inode, v9inode->fscache); - fscache_relinquish_cookie(v9inode->fscache, 1); + fscache_relinquish_cookie(v9inode->fscache, NULL, true); v9inode->fscache = NULL; } @@ -236,12 +195,18 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) old = v9inode->fscache; mutex_lock(&v9inode->fscache_lock); - fscache_relinquish_cookie(v9inode->fscache, 1); + fscache_relinquish_cookie(v9inode->fscache, NULL, true); v9ses = v9fs_inode2v9ses(inode); v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, &v9fs_cache_inode_index_def, - v9inode, true); + &v9inode->qid.path, + sizeof(v9inode->qid.path), + &v9inode->qid.version, + sizeof(v9inode->qid.version), + v9inode, + i_size_read(&v9inode->vfs_inode), + true); p9_debug(P9_DEBUG_FSC, "inode %p 
revalidating cookie old %p new %p\n", inode, old, v9inode->fscache); @@ -367,7 +332,8 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) const struct v9fs_inode *v9inode = V9FS_I(inode); p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); - ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL); + ret = fscache_write_page(v9inode->fscache, page, + i_size_read(&v9inode->vfs_inode), GFP_KERNEL); p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret); if (ret != 0) v9fs_uncache_page(inode, page); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 8fb89ddc6cc7..e622f0f10502 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -292,6 +292,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #ifdef CONFIG_9P_FSCACHE kfree(v9ses->cachetag); v9ses->cachetag = match_strdup(&args[0]); + if (!v9ses->cachetag) { + ret = -ENOMEM; + goto free_and_return; + } #endif break; case Opt_cache: @@ -471,6 +475,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, return fid; err_clnt: +#ifdef CONFIG_9P_FSCACHE + kfree(v9ses->cachetag); +#endif p9_client_destroy(v9ses->clnt); err_names: kfree(v9ses->uname); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index bdabb2765d1b..9ee534159cc6 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -579,6 +579,24 @@ static int v9fs_at_to_dotl_flags(int flags) } /** + * v9fs_dec_count - helper functon to drop i_nlink. + * + * If a directory had nlink <= 2 (including . and ..), then we should not drop + * the link count, which indicates the underlying exported fs doesn't maintain + * nlink accurately. e.g. + * - overlayfs sets nlink to 1 for merged dir + * - ext4 (with dir_nlink feature enabled) sets nlink to 1 if a dir has more + * than EXT4_LINK_MAX (65000) links. 
+ * + * @inode: inode whose nlink is being dropped + */ +static void v9fs_dec_count(struct inode *inode) +{ + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); +} + +/** * v9fs_remove - helper function to remove files and directories * @dir: directory inode that is being deleted * @dentry: dentry that is being deleted @@ -621,9 +639,9 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) */ if (flags & AT_REMOVEDIR) { clear_nlink(inode); - drop_nlink(dir); + v9fs_dec_count(dir); } else - drop_nlink(inode); + v9fs_dec_count(inode); v9fs_invalidate_inode_attr(inode); v9fs_invalidate_inode_attr(dir); @@ -1024,12 +1042,12 @@ clunk_newdir: if (S_ISDIR(new_inode->i_mode)) clear_nlink(new_inode); else - drop_nlink(new_inode); + v9fs_dec_count(new_inode); } if (S_ISDIR(old_inode->i_mode)) { if (!new_inode) inc_nlink(new_dir); - drop_nlink(old_dir); + v9fs_dec_count(old_dir); } v9fs_invalidate_inode_attr(old_inode); v9fs_invalidate_inode_attr(old_dir); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index af03c2a901eb..48ce50484e80 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -94,7 +94,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (v9ses->cache) sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; - sb->s_flags |= SB_ACTIVE | SB_DIRSYNC | SB_NOATIME; + sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; if (!v9ses->cache) sb->s_flags |= SB_SYNCHRONOUS; diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 45b7fc405fa6..532acae25453 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -12,6 +12,8 @@ kafs-objs := \ cell.o \ cmservice.o \ dir.o \ + dir_edit.o \ + dynroot.o \ file.o \ flock.o \ fsclient.o \ diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index fd9f28b8a933..3bedfed608a2 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -243,9 +243,9 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) xport == a->sin6_port) return; if (xdr == 
a->sin6_addr.s6_addr32[3] && - xport < a->sin6_port) + (u16 __force)xport < (u16 __force)a->sin6_port) break; - if (xdr < a->sin6_addr.s6_addr32[3]) + if ((u32 __force)xdr < (u32 __force)a->sin6_addr.s6_addr32[3]) break; } @@ -280,7 +280,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) xport == a->sin6_port) return; if (diff == 0 && - xport < a->sin6_port) + (u16 __force)xport < (u16 __force)a->sin6_port) break; if (diff < 0) break; diff --git a/fs/afs/afs.h b/fs/afs/afs.h index b94d0edc2b78..b4ff1f7ae4ab 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -67,10 +67,14 @@ typedef enum { } afs_callback_type_t; struct afs_callback { - struct afs_fid fid; /* file identifier */ - unsigned version; /* callback version */ - unsigned expiry; /* time at which expires */ - afs_callback_type_t type; /* type of callback */ + unsigned version; /* Callback version */ + unsigned expiry; /* Time at which expires */ + afs_callback_type_t type; /* Type of callback */ +}; + +struct afs_callback_break { + struct afs_fid fid; /* File identifier */ + struct afs_callback cb; /* Callback details */ }; #define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */ @@ -123,21 +127,20 @@ typedef u32 afs_access_t; * AFS file status information */ struct afs_file_status { - unsigned if_version; /* interface version */ -#define AFS_FSTATUS_VERSION 1 + u64 size; /* file size */ + afs_dataversion_t data_version; /* current data version */ + time_t mtime_client; /* last time client changed data */ + time_t mtime_server; /* last time server changed data */ + unsigned abort_code; /* Abort if bulk-fetching this failed */ afs_file_type_t type; /* file type */ unsigned nlink; /* link count */ - u64 size; /* file size */ - afs_dataversion_t data_version; /* current data version */ u32 author; /* author ID */ - kuid_t owner; /* owner ID */ - kgid_t group; /* group ID */ + u32 owner; /* owner ID */ + u32 group; /* group ID */ afs_access_t caller_access; /* access rights 
for authenticated caller */ afs_access_t anon_access; /* access rights for unauthenticated caller */ umode_t mode; /* UNIX mode */ - time_t mtime_client; /* last time client changed data */ - time_t mtime_server; /* last time server changed data */ s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */ }; diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h index d47b6d01e4c0..ddfa88a7a9c0 100644 --- a/fs/afs/afs_fs.h +++ b/fs/afs/afs_fs.h @@ -31,10 +31,12 @@ enum AFS_FS_Operations { FSGETVOLUMEINFO = 148, /* AFS Get information about a volume */ FSGETVOLUMESTATUS = 149, /* AFS Get volume status information */ FSGETROOTVOLUME = 151, /* AFS Get root volume name */ + FSBULKSTATUS = 155, /* AFS Fetch multiple file statuses */ FSSETLOCK = 156, /* AFS Request a file lock */ FSEXTENDLOCK = 157, /* AFS Extend a file lock */ FSRELEASELOCK = 158, /* AFS Release a file lock */ FSLOOKUP = 161, /* AFS lookup file in directory */ + FSINLINEBULKSTATUS = 65536, /* AFS Fetch multiple file statuses with inline errors */ FSFETCHDATA64 = 65537, /* AFS Fetch file data */ FSSTOREDATA64 = 65538, /* AFS Store file data */ FSGIVEUPALLCALLBACKS = 65539, /* AFS Give up all outstanding callbacks on a server */ diff --git a/fs/afs/cache.c b/fs/afs/cache.c index f62ff71d28c9..b1c31ec4523a 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c @@ -12,167 +12,39 @@ #include <linux/sched.h> #include "internal.h" -static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); - -static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, - uint64_t *size); -static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, 
const void *buffer, - uint16_t buflen); + uint16_t buflen, + loff_t object_size); struct fscache_netfs afs_cache_netfs = { .name = "afs", - .version = 1, + .version = 2, }; struct fscache_cookie_def afs_cell_cache_index_def = { .name = "AFS.cell", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = afs_cell_cache_get_key, }; struct fscache_cookie_def afs_volume_cache_index_def = { .name = "AFS.volume", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = afs_volume_cache_get_key, }; struct fscache_cookie_def afs_vnode_cache_index_def = { - .name = "AFS.vnode", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .get_key = afs_vnode_cache_get_key, - .get_attr = afs_vnode_cache_get_attr, - .get_aux = afs_vnode_cache_get_aux, - .check_aux = afs_vnode_cache_check_aux, + .name = "AFS.vnode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .check_aux = afs_vnode_cache_check_aux, }; /* - * set the key for the index entry - */ -static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_cell *cell = cookie_netfs_data; - uint16_t klen; - - _enter("%p,%p,%u", cell, buffer, bufmax); - - klen = strlen(cell->name); - if (klen > bufmax) - return 0; - - memcpy(buffer, cell->name, klen); - return klen; -} - -/*****************************************************************************/ -/* - * set the key for the volume index entry - */ -static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_volume *volume = cookie_netfs_data; - struct { - u64 volid; - } __packed key; - - _enter("{%u},%p,%u", volume->type, buffer, bufmax); - - if (bufmax < sizeof(key)) - return 0; - - key.volid = volume->vid; - memcpy(buffer, &key, sizeof(key)); - return sizeof(key); -} - -/*****************************************************************************/ -/* - * set the key for the index entry - */ -static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, - void *buffer, 
uint16_t bufmax) -{ - const struct afs_vnode *vnode = cookie_netfs_data; - struct { - u32 vnode_id[3]; - } __packed key; - - _enter("{%x,%x,%llx},%p,%u", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, - buffer, bufmax); - - /* Allow for a 96-bit key */ - memset(&key, 0, sizeof(key)); - key.vnode_id[0] = vnode->fid.vnode; - key.vnode_id[1] = 0; - key.vnode_id[2] = 0; - - if (sizeof(key) > bufmax) - return 0; - - memcpy(buffer, &key, sizeof(key)); - return sizeof(key); -} - -/* - * provide updated file attributes - */ -static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, - uint64_t *size) -{ - const struct afs_vnode *vnode = cookie_netfs_data; - - _enter("{%x,%x,%llx},", - vnode->fid.vnode, vnode->fid.unique, - vnode->status.data_version); - - *size = vnode->status.size; -} - -struct afs_vnode_cache_aux { - u64 data_version; - u32 fid_unique; -} __packed; - -/* - * provide new auxiliary cache data - */ -static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_vnode *vnode = cookie_netfs_data; - struct afs_vnode_cache_aux aux; - - _enter("{%x,%x,%Lx},%p,%u", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, - buffer, bufmax); - - memset(&aux, 0, sizeof(aux)); - aux.data_version = vnode->status.data_version; - aux.fid_unique = vnode->fid.unique; - - if (bufmax < sizeof(aux)) - return 0; - - memcpy(buffer, &aux, sizeof(aux)); - return sizeof(aux); -} - -/* * check that the auxiliary data indicates that the entry is still valid */ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, const void *buffer, - uint16_t buflen) + uint16_t buflen, + loff_t object_size) { struct afs_vnode *vnode = cookie_netfs_data; struct afs_vnode_cache_aux aux; @@ -189,12 +61,6 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OBSOLETE; } - if (vnode->fid.unique != aux.fid_unique) { - 
_leave(" = OBSOLETE [uniq %x != %x]", - aux.fid_unique, vnode->fid.unique); - return FSCACHE_CHECKAUX_OBSOLETE; - } - if (vnode->status.data_version != aux.data_version) { _leave(" = OBSOLETE [vers %llx != %llx]", aux.data_version, vnode->status.data_version); diff --git a/fs/afs/callback.c b/fs/afs/callback.c index f4291b576054..abd9a84f4e88 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -97,26 +97,6 @@ again: } /* - * Set a vnode's interest on a server. - */ -void afs_set_cb_interest(struct afs_vnode *vnode, struct afs_cb_interest *cbi) -{ - struct afs_cb_interest *old_cbi = NULL; - - if (vnode->cb_interest == cbi) - return; - - write_seqlock(&vnode->cb_lock); - if (vnode->cb_interest != cbi) { - afs_get_cb_interest(cbi); - old_cbi = vnode->cb_interest; - vnode->cb_interest = cbi; - } - write_sequnlock(&vnode->cb_lock); - afs_put_cb_interest(afs_v2net(vnode), cbi); -} - -/* * Remove an interest on a server. */ void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi) @@ -150,6 +130,7 @@ void afs_break_callback(struct afs_vnode *vnode) write_seqlock(&vnode->cb_lock); + clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { vnode->cb_break++; afs_clear_permits(vnode); @@ -207,7 +188,7 @@ static void afs_break_one_callback(struct afs_server *server, * allow the fileserver to break callback promises */ void afs_break_callbacks(struct afs_server *server, size_t count, - struct afs_callback callbacks[]) + struct afs_callback_break *callbacks) { _enter("%p,%zu,", server, count); @@ -219,9 +200,9 @@ void afs_break_callbacks(struct afs_server *server, size_t count, callbacks->fid.vid, callbacks->fid.vnode, callbacks->fid.unique, - callbacks->version, - callbacks->expiry, - callbacks->type + callbacks->cb.version, + callbacks->cb.expiry, + callbacks->cb.type ); afs_break_one_callback(server, &callbacks->fid); } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 3d2c5e0e854e..fdf4c36cff79 
100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -18,7 +18,7 @@ #include <keys/rxrpc-type.h> #include "internal.h" -unsigned __read_mostly afs_cell_gc_delay = 10; +static unsigned __read_mostly afs_cell_gc_delay = 10; static void afs_manage_cell(struct work_struct *); @@ -75,7 +75,7 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, cell = rcu_dereference_raw(net->ws_cell); if (cell) { afs_get_cell(cell); - continue; + break; } ret = -EDESTADDRREQ; continue; @@ -130,6 +130,8 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); } + if (namelen == 5 && memcmp(name, "@cell", 5) == 0) + return ERR_PTR(-EINVAL); _enter("%*.*s,%s", namelen, namelen, name, vllist); @@ -334,8 +336,8 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) return PTR_ERR(new_root); } - set_bit(AFS_CELL_FL_NO_GC, &new_root->flags); - afs_get_cell(new_root); + if (!test_and_set_bit(AFS_CELL_FL_NO_GC, &new_root->flags)) + afs_get_cell(new_root); /* install the new cell */ write_seqlock(&net->cells_lock); @@ -411,7 +413,7 @@ static void afs_cell_destroy(struct rcu_head *rcu) ASSERTCMP(atomic_read(&cell->usage), ==, 0); - afs_put_addrlist(cell->vl_addrs); + afs_put_addrlist(rcu_access_pointer(cell->vl_addrs)); key_put(cell->anonymous_key); kfree(cell); @@ -522,7 +524,9 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) #ifdef CONFIG_AFS_FSCACHE cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, &afs_cell_cache_index_def, - cell, true); + cell->name, strlen(cell->name), + NULL, 0, + cell, 0, true); #endif ret = afs_proc_cell_setup(net, cell); if (ret < 0) @@ -547,7 +551,7 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) spin_unlock(&net->proc_cells_lock); #ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(cell->cache, 0); + fscache_relinquish_cookie(cell->cache, NULL, false); cell->cache = NULL; #endif diff --git a/fs/afs/cmservice.c 
b/fs/afs/cmservice.c index 41e277f57b20..357de908df3a 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -178,8 +178,8 @@ static void SRXAFSCB_CallBack(struct work_struct *work) */ static int afs_deliver_cb_callback(struct afs_call *call) { + struct afs_callback_break *cb; struct sockaddr_rxrpc srx; - struct afs_callback *cb; struct afs_server *server; __be32 *bp; int ret, loop; @@ -201,7 +201,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->count = ntohl(call->tmp); _debug("FID count: %u", call->count); if (call->count > AFSCBMAX) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL); if (!call->buffer) @@ -218,7 +218,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) _debug("unmarshall FID array"); call->request = kcalloc(call->count, - sizeof(struct afs_callback), + sizeof(struct afs_callback_break), GFP_KERNEL); if (!call->request) return -ENOMEM; @@ -229,7 +229,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) cb->fid.vid = ntohl(*bp++); cb->fid.vnode = ntohl(*bp++); cb->fid.unique = ntohl(*bp++); - cb->type = AFSCM_CB_UNTYPED; + cb->cb.type = AFSCM_CB_UNTYPED; } call->offset = 0; @@ -245,7 +245,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->count2 = ntohl(call->tmp); _debug("CB count: %u", call->count2); if (call->count2 != call->count && call->count2 != 0) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->offset = 0; call->unmarshall++; @@ -260,9 +260,9 @@ static int afs_deliver_cb_callback(struct afs_call *call) cb = call->request; bp = call->buffer; for (loop = call->count2; loop > 0; loop--, cb++) { - cb->version = ntohl(*bp++); - cb->expiry = ntohl(*bp++); - cb->type = ntohl(*bp++); + cb->cb.version = ntohl(*bp++); + cb->cb.expiry = ntohl(*bp++); + cb->cb.type = ntohl(*bp++); } call->offset = 0; @@ -500,9 +500,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) b = call->buffer; r = 
call->request; - r->time_low = ntohl(b[0]); - r->time_mid = ntohl(b[1]); - r->time_hi_and_version = ntohl(b[2]); + r->time_low = b[0]; + r->time_mid = htons(ntohl(b[1])); + r->time_hi_and_version = htons(ntohl(b[2])); r->clock_seq_hi_and_reserved = ntohl(b[3]); r->clock_seq_low = ntohl(b[4]); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index ba2b458b36d1..5889f70d4d27 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1,6 +1,6 @@ /* dir.c: AFS filesystem directory handling * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2002, 2018 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -10,27 +10,26 @@ */ #include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/pagemap.h> +#include <linux/swap.h> #include <linux/ctype.h> #include <linux/sched.h> -#include <linux/dns_resolver.h> +#include <linux/task_io_accounting_ops.h> #include "internal.h" +#include "xdr_fs.h" static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); -static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags); static int afs_dir_open(struct inode *inode, struct file *file); static int afs_readdir(struct file *file, struct dir_context *ctx); static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); -static void afs_d_release(struct dentry *dentry); -static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, +static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); +static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, + loff_t fpos, u64 ino, unsigned dtype); static int afs_create(struct inode *dir, struct dentry *dentry, 
umode_t mode, bool excl); static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); @@ -43,6 +42,14 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); +static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags); +static void afs_dir_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); + +static int afs_dir_set_page_dirty(struct page *page) +{ + BUG(); /* This should never happen. */ +} const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, @@ -67,15 +74,10 @@ const struct inode_operations afs_dir_inode_operations = { .listxattr = afs_listxattr, }; -const struct file_operations afs_dynroot_file_operations = { - .open = dcache_dir_open, - .release = dcache_dir_close, - .iterate_shared = dcache_readdir, - .llseek = dcache_dir_lseek, -}; - -const struct inode_operations afs_dynroot_inode_operations = { - .lookup = afs_dynroot_lookup, +const struct address_space_operations afs_dir_aops = { + .set_page_dirty = afs_dir_set_page_dirty, + .releasepage = afs_dir_releasepage, + .invalidatepage = afs_dir_invalidatepage, }; const struct dentry_operations afs_fs_dentry_operations = { @@ -85,91 +87,38 @@ const struct dentry_operations afs_fs_dentry_operations = { .d_automount = afs_d_automount, }; -#define AFS_DIR_HASHTBL_SIZE 128 -#define AFS_DIR_DIRENT_SIZE 32 -#define AFS_DIRENT_PER_BLOCK 64 - -union afs_dirent { - struct { - uint8_t valid; - uint8_t unused[1]; - __be16 hash_next; - __be32 vnode; - __be32 unique; - uint8_t name[16]; - uint8_t overflow[4]; /* if any char of the name (inc - * NUL) reaches here, consume - * the next dirent too */ - } u; - uint8_t extended_name[32]; -}; - -/* AFS directory page header (one at the beginning of every 2048-byte chunk) */ -struct afs_dir_pagehdr { - __be16 npages; - __be16 magic; -#define 
AFS_DIR_MAGIC htons(1234) - uint8_t nentries; - uint8_t bitmap[8]; - uint8_t pad[19]; -}; - -/* directory block layout */ -union afs_dir_block { - - struct afs_dir_pagehdr pagehdr; - - struct { - struct afs_dir_pagehdr pagehdr; - uint8_t alloc_ctrs[128]; - /* dir hash table */ - uint16_t hashtable[AFS_DIR_HASHTBL_SIZE]; - } hdr; - - union afs_dirent dirents[AFS_DIRENT_PER_BLOCK]; -}; - -/* layout on a linux VM page */ -struct afs_dir_page { - union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)]; +struct afs_lookup_one_cookie { + struct dir_context ctx; + struct qstr name; + bool found; + struct afs_fid fid; }; struct afs_lookup_cookie { - struct dir_context ctx; - struct afs_fid fid; - struct qstr name; - int found; + struct dir_context ctx; + struct qstr name; + bool found; + bool one_only; + unsigned short nr_fids; + struct afs_file_status *statuses; + struct afs_callback *callbacks; + struct afs_fid fids[50]; }; /* * check that a directory page is valid */ -bool afs_dir_check_page(struct inode *dir, struct page *page) +static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, + loff_t i_size) { - struct afs_dir_page *dbuf; - struct afs_vnode *vnode = AFS_FS_I(dir); - loff_t latter, i_size, off; + struct afs_xdr_dir_page *dbuf; + loff_t latter, off; int tmp, qty; -#if 0 - /* check the page count */ - qty = desc.size / sizeof(dbuf->blocks[0]); - if (qty == 0) - goto error; - - if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) { - printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n", - __func__, dir->i_ino, qty, - ntohs(dbuf->blocks[0].pagehdr.npages)); - goto error; - } -#endif - /* Determine how many magic numbers there should be in this page, but * we must take care because the directory may change size under us. 
*/ off = page_offset(page); - i_size = i_size_read(dir); if (i_size <= off) goto checked; @@ -178,112 +127,225 @@ bool afs_dir_check_page(struct inode *dir, struct page *page) qty = PAGE_SIZE; else qty = latter; - qty /= sizeof(union afs_dir_block); + qty /= sizeof(union afs_xdr_dir_block); /* check them */ - dbuf = page_address(page); + dbuf = kmap(page); for (tmp = 0; tmp < qty; tmp++) { - if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) { + if (dbuf->blocks[tmp].hdr.magic != AFS_DIR_MAGIC) { printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n", - __func__, dir->i_ino, tmp, qty, - ntohs(dbuf->blocks[tmp].pagehdr.magic)); - trace_afs_dir_check_failed(vnode, off, i_size); + __func__, dvnode->vfs_inode.i_ino, tmp, qty, + ntohs(dbuf->blocks[tmp].hdr.magic)); + trace_afs_dir_check_failed(dvnode, off, i_size); + kunmap(page); goto error; } + + /* Make sure each block is NUL terminated so we can reasonably + * use string functions on it. The filenames in the page + * *should* be NUL-terminated anyway. + */ + ((u8 *)&dbuf->blocks[tmp])[AFS_DIR_BLOCK_SIZE - 1] = 0; } + kunmap(page); + checked: - SetPageChecked(page); + afs_stat_v(dvnode, n_read_dir); return true; error: - SetPageError(page); return false; } /* - * discard a page cached in the pagecache + * open an AFS directory file */ -static inline void afs_dir_put_page(struct page *page) +static int afs_dir_open(struct inode *inode, struct file *file) { - kunmap(page); - unlock_page(page); - put_page(page); + _enter("{%lu}", inode->i_ino); + + BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); + + if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags)) + return -ENOENT; + + return afs_open(inode, file); } /* - * get a page into the pagecache + * Read the directory into the pagecache in one go, scrubbing the previous + * contents. The list of pages is returned, pinning them so that they don't + * get reclaimed during the iteration. 
*/ -static struct page *afs_dir_get_page(struct inode *dir, unsigned long index, - struct key *key) +static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) { - struct page *page; - _enter("{%lu},%lu", dir->i_ino, index); - - page = read_cache_page(dir->i_mapping, index, afs_page_filler, key); - if (!IS_ERR(page)) { - lock_page(page); - kmap(page); - if (unlikely(!PageChecked(page))) { - if (PageError(page)) - goto fail; - } + struct afs_read *req; + loff_t i_size; + int nr_pages, nr_inline, i, n; + int ret = -ENOMEM; + +retry: + i_size = i_size_read(&dvnode->vfs_inode); + if (i_size < 2048) + return ERR_PTR(-EIO); + if (i_size > 2048 * 1024) + return ERR_PTR(-EFBIG); + + _enter("%llu", i_size); + + /* Get a request record to hold the page list. We want to hold it + * inline if we can, but we don't want to make an order 1 allocation. + */ + nr_pages = (i_size + PAGE_SIZE - 1) / PAGE_SIZE; + nr_inline = nr_pages; + if (nr_inline > (PAGE_SIZE - sizeof(*req)) / sizeof(struct page *)) + nr_inline = 0; + + req = kzalloc(sizeof(*req) + sizeof(struct page *) * nr_inline, + GFP_KERNEL); + if (!req) + return ERR_PTR(-ENOMEM); + + refcount_set(&req->usage, 1); + req->nr_pages = nr_pages; + req->actual_len = i_size; /* May change */ + req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ + req->data_version = dvnode->status.data_version; /* May change */ + if (nr_inline > 0) { + req->pages = req->array; + } else { + req->pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!req->pages) + goto error; } - return page; -fail: - afs_dir_put_page(page); - _leave(" = -EIO"); - return ERR_PTR(-EIO); -} + /* Get a list of all the pages that hold or will hold the directory + * content. We need to fill in any gaps that we might find where the + * memory reclaimer has been at work. If there are any gaps, we will + * need to reread the entire directory contents. 
+ */ + i = 0; + do { + n = find_get_pages_contig(dvnode->vfs_inode.i_mapping, i, + req->nr_pages - i, + req->pages + i); + _debug("find %u at %u/%u", n, i, req->nr_pages); + if (n == 0) { + gfp_t gfp = dvnode->vfs_inode.i_mapping->gfp_mask; + + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_stat_v(dvnode, n_inval); + + ret = -ENOMEM; + req->pages[i] = __page_cache_alloc(gfp); + if (!req->pages[i]) + goto error; + ret = add_to_page_cache_lru(req->pages[i], + dvnode->vfs_inode.i_mapping, + i, gfp); + if (ret < 0) + goto error; + + set_page_private(req->pages[i], 1); + SetPagePrivate(req->pages[i]); + unlock_page(req->pages[i]); + i++; + } else { + i += n; + } + } while (i < req->nr_pages); -/* - * open an AFS directory file - */ -static int afs_dir_open(struct inode *inode, struct file *file) -{ - _enter("{%lu}", inode->i_ino); + /* If we're going to reload, we need to lock all the pages to prevent + * races. + */ + if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { + ret = -ERESTARTSYS; + for (i = 0; i < req->nr_pages; i++) + if (lock_page_killable(req->pages[i]) < 0) + goto error_unlock; - BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); - BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + goto success; - if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags)) - return -ENOENT; + ret = afs_fetch_data(dvnode, key, req); + if (ret < 0) + goto error_unlock_all; - return afs_open(inode, file); + task_io_account_read(PAGE_SIZE * req->nr_pages); + + if (req->len < req->file_size) + goto content_has_grown; + + /* Validate the data we just read. 
*/ + ret = -EIO; + for (i = 0; i < req->nr_pages; i++) + if (!afs_dir_check_page(dvnode, req->pages[i], + req->actual_len)) + goto error_unlock_all; + + // TODO: Trim excess pages + + set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); + } + +success: + i = req->nr_pages; + while (i > 0) + unlock_page(req->pages[--i]); + return req; + +error_unlock_all: + i = req->nr_pages; +error_unlock: + while (i > 0) + unlock_page(req->pages[--i]); +error: + afs_put_read(req); + _leave(" = %d", ret); + return ERR_PTR(ret); + +content_has_grown: + i = req->nr_pages; + while (i > 0) + unlock_page(req->pages[--i]); + afs_put_read(req); + goto retry; } /* * deal with one block in an AFS directory */ static int afs_dir_iterate_block(struct dir_context *ctx, - union afs_dir_block *block, + union afs_xdr_dir_block *block, unsigned blkoff) { - union afs_dirent *dire; + union afs_xdr_dirent *dire; unsigned offset, next, curr; size_t nlen; int tmp; _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block); - curr = (ctx->pos - blkoff) / sizeof(union afs_dirent); + curr = (ctx->pos - blkoff) / sizeof(union afs_xdr_dirent); /* walk through the block, an entry at a time */ - for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries; - offset < AFS_DIRENT_PER_BLOCK; + for (offset = (blkoff == 0 ? 
AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + offset < AFS_DIR_SLOTS_PER_BLOCK; offset = next ) { next = offset + 1; /* skip entries marked unused in the bitmap */ - if (!(block->pagehdr.bitmap[offset / 8] & + if (!(block->hdr.bitmap[offset / 8] & (1 << (offset % 8)))) { _debug("ENT[%zu.%u]: unused", - blkoff / sizeof(union afs_dir_block), offset); + blkoff / sizeof(union afs_xdr_dir_block), offset); if (offset >= curr) ctx->pos = blkoff + - next * sizeof(union afs_dirent); + next * sizeof(union afs_xdr_dirent); continue; } @@ -291,34 +353,34 @@ static int afs_dir_iterate_block(struct dir_context *ctx, dire = &block->dirents[offset]; nlen = strnlen(dire->u.name, sizeof(*block) - - offset * sizeof(union afs_dirent)); + offset * sizeof(union afs_xdr_dirent)); _debug("ENT[%zu.%u]: %s %zu \"%s\"", - blkoff / sizeof(union afs_dir_block), offset, + blkoff / sizeof(union afs_xdr_dir_block), offset, (offset < curr ? "skip" : "fill"), nlen, dire->u.name); /* work out where the next possible entry is */ - for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_dirent)) { - if (next >= AFS_DIRENT_PER_BLOCK) { + for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_xdr_dirent)) { + if (next >= AFS_DIR_SLOTS_PER_BLOCK) { _debug("ENT[%zu.%u]:" " %u travelled beyond end dir block" " (len %u/%zu)", - blkoff / sizeof(union afs_dir_block), + blkoff / sizeof(union afs_xdr_dir_block), offset, next, tmp, nlen); return -EIO; } - if (!(block->pagehdr.bitmap[next / 8] & + if (!(block->hdr.bitmap[next / 8] & (1 << (next % 8)))) { _debug("ENT[%zu.%u]:" " %u unmarked extension (len %u/%zu)", - blkoff / sizeof(union afs_dir_block), + blkoff / sizeof(union afs_xdr_dir_block), offset, next, tmp, nlen); return -EIO; } _debug("ENT[%zu.%u]: ext %u/%zu", - blkoff / sizeof(union afs_dir_block), + blkoff / sizeof(union afs_xdr_dir_block), next, tmp, nlen); next++; } @@ -330,13 +392,14 @@ static int afs_dir_iterate_block(struct dir_context *ctx, /* found the next entry */ if (!dir_emit(ctx, dire->u.name, 
nlen, ntohl(dire->u.vnode), - ctx->actor == afs_lookup_filldir ? + (ctx->actor == afs_lookup_filldir || + ctx->actor == afs_lookup_one_filldir)? ntohl(dire->u.unique) : DT_UNKNOWN)) { _leave(" = 0 [full]"); return 0; } - ctx->pos = blkoff + next * sizeof(union afs_dirent); + ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); } _leave(" = 1 [more]"); @@ -349,8 +412,10 @@ static int afs_dir_iterate_block(struct dir_context *ctx, static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, struct key *key) { - union afs_dir_block *dblock; - struct afs_dir_page *dbuf; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_xdr_dir_page *dbuf; + union afs_xdr_dir_block *dblock; + struct afs_read *req; struct page *page; unsigned blkoff, limit; int ret; @@ -362,45 +427,53 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, return -ESTALE; } + req = afs_read_dir(dvnode, key); + if (IS_ERR(req)) + return PTR_ERR(req); + /* round the file position up to the next entry boundary */ - ctx->pos += sizeof(union afs_dirent) - 1; - ctx->pos &= ~(sizeof(union afs_dirent) - 1); + ctx->pos += sizeof(union afs_xdr_dirent) - 1; + ctx->pos &= ~(sizeof(union afs_xdr_dirent) - 1); /* walk through the blocks in sequence */ ret = 0; - while (ctx->pos < dir->i_size) { - blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1); + while (ctx->pos < req->actual_len) { + blkoff = ctx->pos & ~(sizeof(union afs_xdr_dir_block) - 1); - /* fetch the appropriate page from the directory */ - page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key); - if (IS_ERR(page)) { - ret = PTR_ERR(page); + /* Fetch the appropriate page from the directory and re-add it + * to the LRU. 
+ */ + page = req->pages[blkoff / PAGE_SIZE]; + if (!page) { + ret = -EIO; break; } + mark_page_accessed(page); limit = blkoff & ~(PAGE_SIZE - 1); - dbuf = page_address(page); + dbuf = kmap(page); /* deal with the individual blocks stashed on this page */ do { dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / - sizeof(union afs_dir_block)]; + sizeof(union afs_xdr_dir_block)]; ret = afs_dir_iterate_block(ctx, dblock, blkoff); if (ret != 1) { - afs_dir_put_page(page); + kunmap(page); goto out; } - blkoff += sizeof(union afs_dir_block); + blkoff += sizeof(union afs_xdr_dir_block); } while (ctx->pos < dir->i_size && blkoff < limit); - afs_dir_put_page(page); + kunmap(page); ret = 0; } out: + afs_put_read(req); _leave(" = %d", ret); return ret; } @@ -414,23 +487,23 @@ static int afs_readdir(struct file *file, struct dir_context *ctx) } /* - * search the directory for a name + * Search the directory for a single name * - if afs_dir_iterate_block() spots this function, it'll pass the FID * uniquifier through dtype */ -static int afs_lookup_filldir(struct dir_context *ctx, const char *name, - int nlen, loff_t fpos, u64 ino, unsigned dtype) +static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, + int nlen, loff_t fpos, u64 ino, unsigned dtype) { - struct afs_lookup_cookie *cookie = - container_of(ctx, struct afs_lookup_cookie, ctx); + struct afs_lookup_one_cookie *cookie = + container_of(ctx, struct afs_lookup_one_cookie, ctx); _enter("{%s,%u},%s,%u,,%llu,%u", cookie->name.name, cookie->name.len, name, nlen, (unsigned long long) ino, dtype); /* insanity checks first */ - BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); - BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); if (cookie->name.len != nlen || memcmp(cookie->name.name, name, nlen) != 0) { @@ -447,15 +520,15 @@ static int afs_lookup_filldir(struct dir_context *ctx, const char *name, } /* - * 
do a lookup in a directory + * Do a lookup of a single name in a directory * - just returns the FID the dentry name maps to if found */ -static int afs_do_lookup(struct inode *dir, struct dentry *dentry, - struct afs_fid *fid, struct key *key) +static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, + struct afs_fid *fid, struct key *key) { struct afs_super_info *as = dir->i_sb->s_fs_info; - struct afs_lookup_cookie cookie = { - .ctx.actor = afs_lookup_filldir, + struct afs_lookup_one_cookie cookie = { + .ctx.actor = afs_lookup_one_filldir, .name = dentry->d_name, .fid.vid = as->volume->vid }; @@ -482,70 +555,265 @@ static int afs_do_lookup(struct inode *dir, struct dentry *dentry, } /* - * Probe to see if a cell may exist. This prevents positive dentries from - * being created unnecessarily. + * search the directory for a name + * - if afs_dir_iterate_block() spots this function, it'll pass the FID + * uniquifier through dtype */ -static int afs_probe_cell_name(struct dentry *dentry) +static int afs_lookup_filldir(struct dir_context *ctx, const char *name, + int nlen, loff_t fpos, u64 ino, unsigned dtype) { - struct afs_cell *cell; - const char *name = dentry->d_name.name; - size_t len = dentry->d_name.len; + struct afs_lookup_cookie *cookie = + container_of(ctx, struct afs_lookup_cookie, ctx); int ret; - /* Names prefixed with a dot are R/W mounts. 
*/ - if (name[0] == '.') { - if (len == 1) - return -EINVAL; - name++; - len--; - } + _enter("{%s,%u},%s,%u,,%llu,%u", + cookie->name.name, cookie->name.len, name, nlen, + (unsigned long long) ino, dtype); - cell = afs_lookup_cell_rcu(afs_d2net(dentry), name, len); - if (!IS_ERR(cell)) { - afs_put_cell(afs_d2net(dentry), cell); - return 0; + /* insanity checks first */ + BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); + + if (cookie->found) { + if (cookie->nr_fids < 50) { + cookie->fids[cookie->nr_fids].vnode = ino; + cookie->fids[cookie->nr_fids].unique = dtype; + cookie->nr_fids++; + } + } else if (cookie->name.len == nlen && + memcmp(cookie->name.name, name, nlen) == 0) { + cookie->fids[0].vnode = ino; + cookie->fids[0].unique = dtype; + cookie->found = 1; + if (cookie->one_only) + return -1; } - ret = dns_query("afsdb", name, len, "ipv4", NULL, NULL); - if (ret == -ENODATA) - ret = -EDESTADDRREQ; + ret = cookie->nr_fids >= 50 ? -1 : 0; + _leave(" = %d", ret); return ret; } /* - * Try to auto mount the mountpoint with pseudo directory, if the autocell - * operation is setted. + * Do a lookup in a directory. We make use of bulk lookup to query a slew of + * files in one go and create inodes for them. The inode of the file we were + * asked for is returned. 
*/ -static struct inode *afs_try_auto_mntpt(struct dentry *dentry, - struct inode *dir, struct afs_fid *fid) +static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, + struct key *key) { - struct afs_vnode *vnode = AFS_FS_I(dir); - struct inode *inode; - int ret = -ENOENT; + struct afs_lookup_cookie *cookie; + struct afs_cb_interest *cbi = NULL; + struct afs_super_info *as = dir->i_sb->s_fs_info; + struct afs_iget_data data; + struct afs_fs_cursor fc; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct inode *inode = NULL; + int ret, i; - _enter("%p{%pd}, {%x:%u}", - dentry, dentry, vnode->fid.vid, vnode->fid.vnode); + _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); + + cookie = kzalloc(sizeof(struct afs_lookup_cookie), GFP_KERNEL); + if (!cookie) + return ERR_PTR(-ENOMEM); + + cookie->ctx.actor = afs_lookup_filldir; + cookie->name = dentry->d_name; + cookie->nr_fids = 1; /* slot 0 is saved for the fid we actually want */ + + read_seqlock_excl(&dvnode->cb_lock); + if (dvnode->cb_interest && + dvnode->cb_interest->server && + test_bit(AFS_SERVER_FL_NO_IBULK, &dvnode->cb_interest->server->flags)) + cookie->one_only = true; + read_sequnlock_excl(&dvnode->cb_lock); + + for (i = 0; i < 50; i++) + cookie->fids[i].vid = as->volume->vid; + + /* search the directory */ + ret = afs_dir_iterate(dir, &cookie->ctx, key); + if (ret < 0) { + inode = ERR_PTR(ret); + goto out; + } - if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) + inode = ERR_PTR(-ENOENT); + if (!cookie->found) goto out; - ret = afs_probe_cell_name(dentry); - if (ret < 0) + /* Check to see if we already have an inode for the primary fid. 
*/ + data.volume = dvnode->volume; + data.fid = cookie->fids[0]; + inode = ilookup5(dir->i_sb, cookie->fids[0].vnode, afs_iget5_test, &data); + if (inode) goto out; - inode = afs_iget_pseudo_dir(dir->i_sb, false); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); + /* Need space for examining all the selected files */ + inode = ERR_PTR(-ENOMEM); + cookie->statuses = kcalloc(cookie->nr_fids, sizeof(struct afs_file_status), + GFP_KERNEL); + if (!cookie->statuses) goto out; + + cookie->callbacks = kcalloc(cookie->nr_fids, sizeof(struct afs_callback), + GFP_KERNEL); + if (!cookie->callbacks) + goto out_s; + + /* Try FS.InlineBulkStatus first. Abort codes for the individual + * lookups contained therein are stored in the reply without aborting + * the whole operation. + */ + if (cookie->one_only) + goto no_inline_bulk_status; + + inode = ERR_PTR(-ERESTARTSYS); + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + if (test_bit(AFS_SERVER_FL_NO_IBULK, + &fc.cbi->server->flags)) { + fc.ac.abort_code = RX_INVALID_OPERATION; + fc.ac.error = -ECONNABORTED; + break; + } + afs_fs_inline_bulk_status(&fc, + afs_v2net(dvnode), + cookie->fids, + cookie->statuses, + cookie->callbacks, + cookie->nr_fids, NULL); + } + + if (fc.ac.error == 0) + cbi = afs_get_cb_interest(fc.cbi); + if (fc.ac.abort_code == RX_INVALID_OPERATION) + set_bit(AFS_SERVER_FL_NO_IBULK, &fc.cbi->server->flags); + inode = ERR_PTR(afs_end_vnode_operation(&fc)); } - *fid = AFS_FS_I(inode)->fid; - _leave("= %p", inode); - return inode; + if (!IS_ERR(inode)) + goto success; + if (fc.ac.abort_code != RX_INVALID_OPERATION) + goto out_c; + +no_inline_bulk_status: + /* We could try FS.BulkStatus next, but this aborts the entire op if + * any of the lookups fails - so, for the moment, revert to + * FS.FetchStatus for just the primary fid. 
+ */ + cookie->nr_fids = 1; + inode = ERR_PTR(-ERESTARTSYS); + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + afs_fs_fetch_status(&fc, + afs_v2net(dvnode), + cookie->fids, + cookie->statuses, + cookie->callbacks, + NULL); + } + + if (fc.ac.error == 0) + cbi = afs_get_cb_interest(fc.cbi); + inode = ERR_PTR(afs_end_vnode_operation(&fc)); + } + if (IS_ERR(inode)) + goto out_c; + + for (i = 0; i < cookie->nr_fids; i++) + cookie->statuses[i].abort_code = 0; + +success: + /* Turn all the files into inodes and save the first one - which is the + * one we actually want. + */ + if (cookie->statuses[0].abort_code != 0) + inode = ERR_PTR(afs_abort_to_error(cookie->statuses[0].abort_code)); + + for (i = 0; i < cookie->nr_fids; i++) { + struct inode *ti; + + if (cookie->statuses[i].abort_code != 0) + continue; + + ti = afs_iget(dir->i_sb, key, &cookie->fids[i], + &cookie->statuses[i], + &cookie->callbacks[i], + cbi); + if (i == 0) { + inode = ti; + } else { + if (!IS_ERR(ti)) + iput(ti); + } + } + +out_c: + afs_put_cb_interest(afs_v2net(dvnode), cbi); + kfree(cookie->callbacks); +out_s: + kfree(cookie->statuses); out: - _leave("= %d", ret); - return ERR_PTR(ret); + kfree(cookie); + return inode; +} + +/* + * Look up an entry in a directory with @sys substitution. + */ +static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry, + struct key *key) +{ + struct afs_sysnames *subs; + struct afs_net *net = afs_i2net(dir); + struct dentry *ret; + char *buf, *p, *name; + int len, i; + + _enter(""); + + ret = ERR_PTR(-ENOMEM); + p = buf = kmalloc(AFSNAMEMAX, GFP_KERNEL); + if (!buf) + goto out_p; + if (dentry->d_name.len > 4) { + memcpy(p, dentry->d_name.name, dentry->d_name.len - 4); + p += dentry->d_name.len - 4; + } + + /* There is an ordered list of substitutes that we have to try. 
*/ + read_lock(&net->sysnames_lock); + subs = net->sysnames; + refcount_inc(&subs->usage); + read_unlock(&net->sysnames_lock); + + for (i = 0; i < subs->nr; i++) { + name = subs->subs[i]; + len = dentry->d_name.len - 4 + strlen(name); + if (len >= AFSNAMEMAX) { + ret = ERR_PTR(-ENAMETOOLONG); + goto out_s; + } + + strcpy(p, name); + ret = lookup_one_len(buf, dentry->d_parent, len); + if (IS_ERR(ret) || d_is_positive(ret)) + goto out_s; + dput(ret); + } + + /* We don't want to d_add() the @sys dentry here as we don't want to + * the cached dentry to hide changes to the sysnames list. + */ + ret = NULL; +out_s: + afs_put_sysnames(subs); + kfree(buf); +out_p: + key_put(key); + return ret; } /* @@ -554,16 +822,13 @@ out: static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct afs_vnode *vnode; - struct afs_fid fid; + struct afs_vnode *dvnode = AFS_FS_I(dir); struct inode *inode; struct key *key; int ret; - vnode = AFS_FS_I(dir); - _enter("{%x:%u},%p{%pd},", - vnode->fid.vid, vnode->fid.vnode, dentry, dentry); + dvnode->fid.vid, dvnode->fid.vnode, dentry, dentry); ASSERTCMP(d_inode(dentry), ==, NULL); @@ -572,28 +837,37 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENAMETOOLONG); } - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) { _leave(" = -ESTALE"); return ERR_PTR(-ESTALE); } - key = afs_request_key(vnode->volume->cell); + key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { _leave(" = %ld [key]", PTR_ERR(key)); return ERR_CAST(key); } - ret = afs_validate(vnode, key); + ret = afs_validate(dvnode, key); if (ret < 0) { key_put(key); _leave(" = %d [val]", ret); return ERR_PTR(ret); } - ret = afs_do_lookup(dir, dentry, &fid, key); - if (ret < 0) { + if (dentry->d_name.len >= 4 && + dentry->d_name.name[dentry->d_name.len - 4] == '@' && + dentry->d_name.name[dentry->d_name.len - 3] == 's' && + 
dentry->d_name.name[dentry->d_name.len - 2] == 'y' && + dentry->d_name.name[dentry->d_name.len - 1] == 's') + return afs_lookup_atsys(dir, dentry, key); + + afs_stat_v(dvnode, n_lookup); + inode = afs_do_lookup(dir, dentry, key); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); if (ret == -ENOENT) { - inode = afs_try_auto_mntpt(dentry, dir, &fid); + inode = afs_try_auto_mntpt(dentry, dir); if (!IS_ERR(inode)) { key_put(key); goto success; @@ -611,10 +885,9 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, _leave(" = %d [do]", ret); return ERR_PTR(ret); } - dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version; + dentry->d_fsdata = (void *)(unsigned long)dvnode->status.data_version; /* instantiate the dentry */ - inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL, NULL); key_put(key); if (IS_ERR(inode)) { _leave(" = %ld", PTR_ERR(inode)); @@ -623,9 +896,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, success: d_add(dentry, inode); - _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", - fid.vnode, - fid.unique, + _leave(" = 0 { ino=%lu v=%u }", d_inode(dentry)->i_ino, d_inode(dentry)->i_generation); @@ -633,67 +904,23 @@ success: } /* - * Look up an entry in a dynroot directory. 
- */ -static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct afs_vnode *vnode; - struct afs_fid fid; - struct inode *inode; - int ret; - - vnode = AFS_FS_I(dir); - - _enter("%pd", dentry); - - ASSERTCMP(d_inode(dentry), ==, NULL); - - if (dentry->d_name.len >= AFSNAMEMAX) { - _leave(" = -ENAMETOOLONG"); - return ERR_PTR(-ENAMETOOLONG); - } - - inode = afs_try_auto_mntpt(dentry, dir, &fid); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - if (ret == -ENOENT) { - d_add(dentry, NULL); - _leave(" = NULL [negative]"); - return NULL; - } - _leave(" = %d [do]", ret); - return ERR_PTR(ret); - } - - d_add(dentry, inode); - _leave(" = 0 { ino=%lu v=%u }", - d_inode(dentry)->i_ino, d_inode(dentry)->i_generation); - return NULL; -} - -/* * check that a dentry lookup hit has found a valid entry * - NOTE! the hit can be a negative hit too, so we can't assume we have an * inode */ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) { - struct afs_super_info *as = dentry->d_sb->s_fs_info; struct afs_vnode *vnode, *dir; struct afs_fid uninitialized_var(fid); struct dentry *parent; struct inode *inode; struct key *key; - void *dir_version; + long dir_version, de_version; int ret; if (flags & LOOKUP_RCU) return -ECHILD; - if (as->dyn_root) - return 1; - if (d_really_is_positive(dentry)) { vnode = AFS_FS_I(d_inode(dentry)); _enter("{v={%x:%u} n=%pd fl=%lx},", @@ -729,14 +956,25 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) goto out_bad_parent; } - dir_version = (void *) (unsigned long) dir->status.data_version; - if (dentry->d_fsdata == dir_version) - goto out_valid; /* the dir contents are unchanged */ + /* We only need to invalidate a dentry if the server's copy changed + * behind our back. If we made the change, it's no problem. Note that + * on a 32-bit system, we only have 32 bits in the dentry to store the + * version. 
+ */ + dir_version = (long)dir->status.data_version; + de_version = (long)dentry->d_fsdata; + if (de_version == dir_version) + goto out_valid; + + dir_version = (long)dir->invalid_before; + if (de_version - dir_version >= 0) + goto out_valid; _debug("dir modified"); + afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key); + ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key); switch (ret) { case 0: /* the filename maps to something */ @@ -789,7 +1027,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) } out_valid: - dentry->d_fsdata = dir_version; + dentry->d_fsdata = (void *)dir_version; dput(parent); key_put(key); _leave(" = 1 [valid]"); @@ -840,7 +1078,7 @@ zap: /* * handle dentry release */ -static void afs_d_release(struct dentry *dentry) +void afs_d_release(struct dentry *dentry) { _enter("%pd", dentry); } @@ -854,6 +1092,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, struct afs_file_status *newstatus, struct afs_callback *newcb) { + struct afs_vnode *vnode; struct inode *inode; if (fc->ac.error < 0) @@ -871,6 +1110,8 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, return; } + vnode = AFS_FS_I(inode); + set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); d_add(new_dentry, inode); } @@ -885,6 +1126,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_fid newfid; struct key *key; + u64 data_version = dvnode->status.data_version; int ret; mode |= S_IFDIR; @@ -902,7 +1144,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_create(&fc, dentry->d_name.name, mode, + afs_fs_create(&fc, dentry->d_name.name, mode, data_version, &newfid, &newstatus, &newcb); } @@ -916,6 +1158,11 @@ 
static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto error_key; } + if (ret == 0 && + test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_add(dvnode, &dentry->d_name, &newfid, + afs_edit_dir_for_create); + key_put(key); _leave(" = 0"); return 0; @@ -939,6 +1186,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry) clear_nlink(&vnode->vfs_inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); } } @@ -950,6 +1198,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) struct afs_fs_cursor fc; struct afs_vnode *dvnode = AFS_FS_I(dir); struct key *key; + u64 data_version = dvnode->status.data_version; int ret; _enter("{%x:%u},{%pd}", @@ -965,13 +1214,18 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_remove(&fc, dentry->d_name.name, true); + afs_fs_remove(&fc, dentry->d_name.name, true, + data_version); } afs_vnode_commit_status(&fc, dvnode, fc.cb_break); ret = afs_end_vnode_operation(&fc); - if (ret == 0) + if (ret == 0) { afs_dir_remove_subdir(dentry); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_remove(dvnode, &dentry->d_name, + afs_edit_dir_for_rmdir); + } } key_put(key); @@ -1036,6 +1290,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct key *key; unsigned long d_version = (unsigned long)dentry->d_fsdata; + u64 data_version = dvnode->status.data_version; int ret; _enter("{%x:%u},{%pd}", @@ -1062,7 +1317,8 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_remove(&fc, dentry->d_name.name, 
false); + afs_fs_remove(&fc, dentry->d_name.name, false, + data_version); } afs_vnode_commit_status(&fc, dvnode, fc.cb_break); @@ -1071,6 +1327,10 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) ret = afs_dir_remove_link( dentry, key, d_version, (unsigned long)dvnode->status.data_version); + if (ret == 0 && + test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_remove(dvnode, &dentry->d_name, + afs_edit_dir_for_unlink); } error_key: @@ -1092,6 +1352,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_fid newfid; struct key *key; + u64 data_version = dvnode->status.data_version; int ret; mode |= S_IFREG; @@ -1113,7 +1374,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_create(&fc, dentry->d_name.name, mode, + afs_fs_create(&fc, dentry->d_name.name, mode, data_version, &newfid, &newstatus, &newcb); } @@ -1127,6 +1388,10 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, goto error_key; } + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_add(dvnode, &dentry->d_name, &newfid, + afs_edit_dir_for_create); + key_put(key); _leave(" = 0"); return 0; @@ -1148,10 +1413,12 @@ static int afs_link(struct dentry *from, struct inode *dir, struct afs_fs_cursor fc; struct afs_vnode *dvnode, *vnode; struct key *key; + u64 data_version; int ret; vnode = AFS_FS_I(d_inode(from)); dvnode = AFS_FS_I(dir); + data_version = dvnode->status.data_version; _enter("{%x:%u},{%x:%u},{%pd}", vnode->fid.vid, vnode->fid.vnode, @@ -1178,7 +1445,7 @@ static int afs_link(struct dentry *from, struct inode *dir, while (afs_select_fileserver(&fc)) { fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; fc.cb_break_2 = vnode->cb_break + vnode->cb_s_break; - 
afs_fs_link(&fc, vnode, dentry->d_name.name); + afs_fs_link(&fc, vnode, dentry->d_name.name, data_version); } afs_vnode_commit_status(&fc, dvnode, fc.cb_break); @@ -1194,6 +1461,10 @@ static int afs_link(struct dentry *from, struct inode *dir, goto error_key; } + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_add(dvnode, &dentry->d_name, &vnode->fid, + afs_edit_dir_for_link); + key_put(key); _leave(" = 0"); return 0; @@ -1217,6 +1488,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_fid newfid; struct key *key; + u64 data_version = dvnode->status.data_version; int ret; _enter("{%x:%u},{%pd},%s", @@ -1241,7 +1513,8 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_symlink(&fc, dentry->d_name.name, content, + afs_fs_symlink(&fc, dentry->d_name.name, + content, data_version, &newfid, &newstatus); } @@ -1255,6 +1528,10 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, goto error_key; } + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_add(dvnode, &dentry->d_name, &newfid, + afs_edit_dir_for_symlink); + key_put(key); _leave(" = 0"); return 0; @@ -1277,6 +1554,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct afs_fs_cursor fc; struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; struct key *key; + u64 orig_data_version, new_data_version; + bool new_negative = d_is_negative(new_dentry); int ret; if (flags) @@ -1285,6 +1564,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, vnode = AFS_FS_I(d_inode(old_dentry)); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); + orig_data_version = orig_dvnode->status.data_version; + new_data_version = new_dvnode->status.data_version; _enter("{%x:%u},{%x:%u},{%x:%u},{%pd}", 
orig_dvnode->fid.vid, orig_dvnode->fid.vnode, @@ -1310,7 +1591,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, fc.cb_break = orig_dvnode->cb_break + orig_dvnode->cb_s_break; fc.cb_break_2 = new_dvnode->cb_break + new_dvnode->cb_s_break; afs_fs_rename(&fc, old_dentry->d_name.name, - new_dvnode, new_dentry->d_name.name); + new_dvnode, new_dentry->d_name.name, + orig_data_version, new_data_version); } afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break); @@ -1322,9 +1604,68 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, goto error_key; } + if (ret == 0) { + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags)) + afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name, + afs_edit_dir_for_rename); + + if (!new_negative && + test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags)) + afs_edit_dir_remove(new_dvnode, &new_dentry->d_name, + afs_edit_dir_for_rename); + + if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags)) + afs_edit_dir_add(new_dvnode, &new_dentry->d_name, + &vnode->fid, afs_edit_dir_for_rename); + } + error_key: key_put(key); error: _leave(" = %d", ret); return ret; } + +/* + * Release a directory page and clean up its private state if it's not busy + * - return true if the page can now be released, false if not + */ +static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host); + + _enter("{{%x:%u}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index); + + set_page_private(page, 0); + ClearPagePrivate(page); + + /* The directory will need reloading. 
*/ + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_stat_v(dvnode, n_relpg); + return 1; +} + +/* + * invalidate part or all of a page + * - release a page and clean up its private data if offset is 0 (indicating + * the entire page) + */ +static void afs_dir_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host); + + _enter("{%lu},%u,%u", page->index, offset, length); + + BUG_ON(!PageLocked(page)); + + /* The directory will need reloading. */ + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_stat_v(dvnode, n_inval); + + /* we clean up only if the entire page is being invalidated */ + if (offset == 0 && length == PAGE_SIZE) { + set_page_private(page, 0); + ClearPagePrivate(page); + } +} diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c new file mode 100644 index 000000000000..8b400f5aead5 --- /dev/null +++ b/fs/afs/dir_edit.c @@ -0,0 +1,505 @@ +/* AFS filesystem directory editing + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/pagemap.h> +#include <linux/iversion.h> +#include "internal.h" +#include "xdr_fs.h" + +/* + * Find a number of contiguous clear bits in a directory block bitmask. + * + * There are 64 slots, which means we can load the entire bitmap into a + * variable. The first bit doesn't count as it corresponds to the block header + * slot. nr_slots is between 1 and 9. 
+ */
+static int afs_find_contig_bits(union afs_xdr_dir_block *block, unsigned int nr_slots)
+{
+	u64 bitmap;
+	u32 mask;
+	int bit, n;
+
+	/* Assemble the eight bitmap bytes into one word, byte 0 lowest. */
+	bitmap = (u64)block->hdr.bitmap[0] << 0 * 8;
+	bitmap |= (u64)block->hdr.bitmap[1] << 1 * 8;
+	bitmap |= (u64)block->hdr.bitmap[2] << 2 * 8;
+	bitmap |= (u64)block->hdr.bitmap[3] << 3 * 8;
+	bitmap |= (u64)block->hdr.bitmap[4] << 4 * 8;
+	bitmap |= (u64)block->hdr.bitmap[5] << 5 * 8;
+	bitmap |= (u64)block->hdr.bitmap[6] << 6 * 8;
+	bitmap |= (u64)block->hdr.bitmap[7] << 7 * 8;
+	bitmap >>= 1; /* The first entry is metadata */
+	bit = 1;
+	mask = (1 << nr_slots) - 1;
+
+	do {
+		/* Step over the run of set bits to the next clear bit.  When
+		 * unsigned long is narrower than 64 bits the word has to be
+		 * searched in halves: the lower half can only be used if it
+		 * actually contains a clear bit, i.e. it is not all ones.
+		 * (Testing it against zero, as was done previously, selects
+		 * the wrong half for an empty lower half and searches an
+		 * all-ones word for a zero bit.)  The upper half always has a
+		 * clear bit because zeros are shifted in from the top.
+		 */
+		if (sizeof(unsigned long) == 8)
+			n = ffz(bitmap);
+		else
+			n = ((u32)bitmap) != ~(u32)0 ?
+				ffz((u32)bitmap) :
+				ffz((u32)(bitmap >> 32)) + 32;
+		bitmap >>= n;
+		bit += n;
+
+		/* A clear run of nr_slots bits starts here; make sure it
+		 * doesn't run off the end of the 64-slot block.
+		 */
+		if ((bitmap & mask) == 0) {
+			if (bit > 64 - nr_slots)
+				return -1;
+			return bit;
+		}
+
+		/* The clear run was too short; skip to the next set bit. */
+		n = __ffs(bitmap);
+		bitmap >>= n;
+		bit += n;
+	} while (bitmap);
+
+	return -1;
+}
+
+/*
+ * Set a number of contiguous bits in the directory block bitmap.
+ *
+ * @bit is the index of the first slot to mark and @nr_slots the length of the
+ * run.  The bitmap is updated a byte at a time.
+ */
+static void afs_set_contig_bits(union afs_xdr_dir_block *block,
+				int bit, unsigned int nr_slots)
+{
+	u64 mask;
+
+	mask = (1 << nr_slots) - 1;
+	mask <<= bit;
+
+	block->hdr.bitmap[0] |= (u8)(mask >> 0 * 8);
+	block->hdr.bitmap[1] |= (u8)(mask >> 1 * 8);
+	block->hdr.bitmap[2] |= (u8)(mask >> 2 * 8);
+	block->hdr.bitmap[3] |= (u8)(mask >> 3 * 8);
+	block->hdr.bitmap[4] |= (u8)(mask >> 4 * 8);
+	block->hdr.bitmap[5] |= (u8)(mask >> 5 * 8);
+	block->hdr.bitmap[6] |= (u8)(mask >> 6 * 8);
+	block->hdr.bitmap[7] |= (u8)(mask >> 7 * 8);
+}
+
+/*
+ * Clear a number of contiguous bits in the directory block bitmap.
+ *
+ * @bit is the index of the first slot to release and @nr_slots the length of
+ * the run.  The bitmap is updated a byte at a time.
+ */
+static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
+				  int bit, unsigned int nr_slots)
+{
+	u64 mask;
+
+	mask = (1 << nr_slots) - 1;
+	mask <<= bit;
+
+	block->hdr.bitmap[0] &= ~(u8)(mask >> 0 * 8);
+	block->hdr.bitmap[1] &= ~(u8)(mask >> 1 * 8);
+	block->hdr.bitmap[2] &= ~(u8)(mask >> 2 * 8);
+	block->hdr.bitmap[3] &= ~(u8)(mask >> 3 * 8);
+	block->hdr.bitmap[4] &= ~(u8)(mask >> 4 * 8);
+	block->hdr.bitmap[5] &= ~(u8)(mask >> 5 * 8);
+	block->hdr.bitmap[6] &= ~(u8)(mask >> 6 * 8);
+	block->hdr.bitmap[7] &= ~(u8)(mask >> 7 * 8);
+}
+
+/*
+ * Scan a directory block looking for a dirent of the right name.
+ *
+ * Returns the index of the first slot of the matching dirent, or -1 if the
+ * block holds no entry called @name.  @blocknum selects how many leading
+ * slots are reserved and therefore skipped (block 0 reserves more).
+ */
+static int afs_dir_scan_block(union afs_xdr_dir_block *block, struct qstr *name,
+			      unsigned int blocknum)
+{
+	union afs_xdr_dirent *de;
+	u64 bitmap;
+	int d, len, n;
+
+	_enter("");
+
+	/* Assemble the eight bitmap bytes into one word, byte 0 lowest. */
+	bitmap = (u64)block->hdr.bitmap[0] << 0 * 8;
+	bitmap |= (u64)block->hdr.bitmap[1] << 1 * 8;
+	bitmap |= (u64)block->hdr.bitmap[2] << 2 * 8;
+	bitmap |= (u64)block->hdr.bitmap[3] << 3 * 8;
+	bitmap |= (u64)block->hdr.bitmap[4] << 4 * 8;
+	bitmap |= (u64)block->hdr.bitmap[5] << 5 * 8;
+	bitmap |= (u64)block->hdr.bitmap[6] << 6 * 8;
+	bitmap |= (u64)block->hdr.bitmap[7] << 7 * 8;
+
+	for (d = (blocknum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS);
+	     d < AFS_DIR_SLOTS_PER_BLOCK;
+	     d++) {
+		/* Skip slots that aren't allocated or aren't marked valid. */
+		if (!((bitmap >> d) & 1))
+			continue;
+		de = &block->dirents[d];
+		if (de->u.valid != 1)
+			continue;
+
+		/* The block was NUL-terminated by afs_dir_check_page(). */
+		len = strlen(de->u.name);
+		if (len == name->len &&
+		    memcmp(de->u.name, name->name, name->len) == 0)
+			return d;
+
+		/* Not a match: step over the extra slots this entry occupies
+		 * (the loop increment accounts for the first one).
+		 */
+		n = round_up(12 + len + 1 + 4, AFS_DIR_DIRENT_SIZE);
+		n /= AFS_DIR_DIRENT_SIZE;
+		d += n - 1;
+	}
+
+	return -1;
+}
+
+/*
+ * Initialise a new directory block. Note that block 0 is special and contains
+ * some extra metadata.
+ *
+ * @meta is the directory's block 0, which carries the per-block allocation
+ * counters; @block is the block being initialised (the same pointer as @meta
+ * when block_num is 0).
+ */
+static void afs_edit_init_block(union afs_xdr_dir_block *meta,
+				union afs_xdr_dir_block *block, int block_num)
+{
+	memset(block, 0, sizeof(*block));
+	block->hdr.npages = htons(1);
+	block->hdr.magic = AFS_DIR_MAGIC;
+	block->hdr.bitmap[0] = 1;
+
+	if (block_num == 0) {
+		/* Block 0 reserves extra leading slots, so more bitmap bits
+		 * are preset and its free-slot counter starts lower.
+		 */
+		block->hdr.bitmap[0] = 0xff;
+		block->hdr.bitmap[1] = 0x1f;
+		memset(block->meta.alloc_ctrs,
+		       AFS_DIR_SLOTS_PER_BLOCK,
+		       sizeof(block->meta.alloc_ctrs));
+		meta->meta.alloc_ctrs[0] =
+			AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS0;
+	} else if (block_num < AFS_DIR_BLOCKS_WITH_CTR) {
+		/* Must not also run for block 0, or it would overwrite the
+		 * counter set above with the value for an ordinary block.
+		 */
+		meta->meta.alloc_ctrs[block_num] =
+			AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS;
+	}
+}
+
+/*
+ * Edit a directory's file data to add a new directory entry. Doing this after
+ * create, mkdir, symlink, link or rename if the data version number is
+ * incremented by exactly one avoids the need to re-download the entire
+ * directory contents.
+ *
+ * The caller must hold the inode locked.
+ */
+void afs_edit_dir_add(struct afs_vnode *vnode,
+		      struct qstr *name, struct afs_fid *new_fid,
+		      enum afs_edit_dir_reason why)
+{
+	union afs_xdr_dir_block *meta, *block;
+	struct afs_xdr_dir_page *meta_page, *dir_page;
+	union afs_xdr_dirent *de;
+	struct page *page0, *page;
+	unsigned int need_slots, nr_blocks, b;
+	pgoff_t index;
+	loff_t i_size;
+	gfp_t gfp;
+	int slot;
+
+	_enter(",,{%d,%s},", name->len, name->name);
+
+	i_size = i_size_read(&vnode->vfs_inode);
+	if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
+	    (i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
+		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+		return;
+	}
+
+	gfp = vnode->vfs_inode.i_mapping->gfp_mask;
+	page0 = find_or_create_page(vnode->vfs_inode.i_mapping, 0, gfp);
+	if (!page0) {
+		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+		_leave(" [fgp]");
+		return;
+	}
+
+	/* Work out how many slots we're going to need.
*/ + need_slots = round_up(12 + name->len + 1 + 4, AFS_DIR_DIRENT_SIZE); + need_slots /= AFS_DIR_DIRENT_SIZE; + + meta_page = kmap(page0); + meta = &meta_page->blocks[0]; + if (i_size == 0) + goto new_directory; + nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; + + /* Find a block that has sufficient slots available. Each VM page + * contains two or more directory blocks. + */ + for (b = 0; b < nr_blocks + 1; b++) { + /* If the directory extended into a new page, then we need to + * tack a new page on the end. + */ + index = b / AFS_DIR_BLOCKS_PER_PAGE; + if (index == 0) { + page = page0; + dir_page = meta_page; + } else { + if (nr_blocks >= AFS_DIR_MAX_BLOCKS) + goto error; + gfp = vnode->vfs_inode.i_mapping->gfp_mask; + page = find_or_create_page(vnode->vfs_inode.i_mapping, + index, gfp); + if (!page) + goto error; + if (!PagePrivate(page)) { + set_page_private(page, 1); + SetPagePrivate(page); + } + dir_page = kmap(page); + } + + /* Abandon the edit if we got a callback break. */ + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto invalidated; + + block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE]; + + _debug("block %u: %2u %3u %u", + b, + (b < AFS_DIR_BLOCKS_WITH_CTR) ? meta->meta.alloc_ctrs[b] : 99, + ntohs(block->hdr.npages), + ntohs(block->hdr.magic)); + + /* Initialise the block if necessary. */ + if (b == nr_blocks) { + _debug("init %u", b); + afs_edit_init_block(meta, block, b); + i_size_write(&vnode->vfs_inode, (b + 1) * AFS_DIR_BLOCK_SIZE); + } + + /* Only lower dir pages have a counter in the header. */ + if (b >= AFS_DIR_BLOCKS_WITH_CTR || + meta->meta.alloc_ctrs[b] >= need_slots) { + /* We need to try and find one or more consecutive + * slots to hold the entry. 
+ */ + slot = afs_find_contig_bits(block, need_slots); + if (slot >= 0) { + _debug("slot %u", slot); + goto found_space; + } + } + + if (page != page0) { + unlock_page(page); + kunmap(page); + put_page(page); + } + } + + /* There are no spare slots of sufficient size, yet the operation + * succeeded. Download the directory again. + */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_create_nospc, 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out_unmap; + +new_directory: + afs_edit_init_block(meta, meta, 0); + i_size = AFS_DIR_BLOCK_SIZE; + i_size_write(&vnode->vfs_inode, i_size); + slot = AFS_DIR_RESV_BLOCKS0; + page = page0; + block = meta; + nr_blocks = 1; + b = 0; + +found_space: + /* Set the dirent slot. */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_create, b, slot, + new_fid->vnode, new_fid->unique, name->name); + de = &block->dirents[slot]; + de->u.valid = 1; + de->u.unused[0] = 0; + de->u.hash_next = 0; // TODO: Really need to maintain this + de->u.vnode = htonl(new_fid->vnode); + de->u.unique = htonl(new_fid->unique); + memcpy(de->u.name, name->name, name->len + 1); + de->u.name[name->len] = 0; + + /* Adjust the bitmap. */ + afs_set_contig_bits(block, slot, need_slots); + if (page != page0) { + unlock_page(page); + kunmap(page); + put_page(page); + } + + /* Adjust the allocation counter. 
*/
+	if (b < AFS_DIR_BLOCKS_WITH_CTR)
+		meta->meta.alloc_ctrs[b] -= need_slots;
+
+	inode_inc_iversion_raw(&vnode->vfs_inode);
+	afs_stat_v(vnode, n_dir_cr);
+	_debug("Insert %s in %u[%u]", name->name, b, slot);
+
+out_unmap:
+	unlock_page(page0);
+	kunmap(page0);
+	put_page(page0);
+	_leave("");
+	return;
+
+invalidated:
+	trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name);
+	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+	if (page != page0) {
+		/* The page is still locked at this point, exactly as it is on
+		 * the other paths that release a non-zero page, so it must be
+		 * unlocked before the reference is dropped.
+		 */
+		unlock_page(page);
+		kunmap(page);
+		put_page(page);
+	}
+	goto out_unmap;
+
+error:
+	trace_afs_edit_dir(vnode, why, afs_edit_dir_create_error, 0, 0, 0, 0, name->name);
+	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+	goto out_unmap;
+}
+
+/*
+ * Edit a directory's file data to remove a directory entry. Doing this
+ * after unlink, rmdir or rename if the data version number is incremented by
+ * exactly one avoids the need to re-download the entire directory contents.
+ *
+ * The caller must hold the inode locked.
+ */
+void afs_edit_dir_remove(struct afs_vnode *vnode,
+			 struct qstr *name, enum afs_edit_dir_reason why)
+{
+	struct afs_xdr_dir_page *meta_page, *dir_page;
+	union afs_xdr_dir_block *meta, *block;
+	union afs_xdr_dirent *de;
+	struct page *page0, *page;
+	unsigned int need_slots, nr_blocks, b;
+	pgoff_t index;
+	loff_t i_size;
+	int slot;
+
+	_enter(",,{%d,%s},", name->len, name->name);
+
+	i_size = i_size_read(&vnode->vfs_inode);
+	if (i_size < AFS_DIR_BLOCK_SIZE ||
+	    i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
+	    (i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
+		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+		return;
+	}
+	nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
+
+	page0 = find_lock_page(vnode->vfs_inode.i_mapping, 0);
+	if (!page0) {
+		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+		_leave(" [fgp]");
+		return;
+	}
+
+	/* Work out how many slots we're going to discard.
*/
+	need_slots = round_up(12 + name->len + 1 + 4, AFS_DIR_DIRENT_SIZE);
+	need_slots /= AFS_DIR_DIRENT_SIZE;
+
+	meta_page = kmap(page0);
+	meta = &meta_page->blocks[0];
+
+	/* Find the block that holds the dirent. Each VM page contains two or
+	 * more directory blocks.
+	 */
+	for (b = 0; b < nr_blocks; b++) {
+		index = b / AFS_DIR_BLOCKS_PER_PAGE;
+		if (index != 0) {
+			page = find_lock_page(vnode->vfs_inode.i_mapping, index);
+			if (!page)
+				goto error;
+			dir_page = kmap(page);
+		} else {
+			page = page0;
+			dir_page = meta_page;
+		}
+
+		/* Abandon the edit if we got a callback break. */
+		if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+			goto invalidated;
+
+		block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE];
+
+		/* Only blocks below AFS_DIR_BLOCKS_WITH_CTR have an allocation
+		 * counter, so the counter may only be consulted for
+		 * b < AFS_DIR_BLOCKS_WITH_CTR; every other block must simply
+		 * be scanned.  For a counted block, skip the scan if it has
+		 * too many free slots to be holding an entry of this size.
+		 */
+		if (b >= AFS_DIR_BLOCKS_WITH_CTR ||
+		    meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) {
+			slot = afs_dir_scan_block(block, name, b);
+			if (slot >= 0)
+				goto found_dirent;
+		}
+
+		if (page != page0) {
+			unlock_page(page);
+			kunmap(page);
+			put_page(page);
+		}
+	}
+
+	/* Didn't find the dirent to clobber. Download the directory again. */
+	trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent,
+			   0, 0, 0, 0, name->name);
+	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+	goto out_unmap;
+
+found_dirent:
+	de = &block->dirents[slot];
+
+	trace_afs_edit_dir(vnode, why, afs_edit_dir_delete, b, slot,
+			   ntohl(de->u.vnode), ntohl(de->u.unique),
+			   name->name);
+
+	/* Wipe every slot the entry occupied, not just the first. */
+	memset(de, 0, sizeof(*de) * need_slots);
+
+	/* Adjust the bitmap. */
+	afs_clear_contig_bits(block, slot, need_slots);
+	if (page != page0) {
+		unlock_page(page);
+		kunmap(page);
+		put_page(page);
+	}
+
+	/* Adjust the allocation counter.
*/ + if (b < AFS_DIR_BLOCKS_WITH_CTR) + meta->meta.alloc_ctrs[b] += need_slots; + + inode_set_iversion_raw(&vnode->vfs_inode, vnode->status.data_version); + afs_stat_v(vnode, n_dir_rm); + _debug("Remove %s from %u[%u]", name->name, b, slot); + +out_unmap: + unlock_page(page0); + kunmap(page0); + put_page(page0); + _leave(""); + return; + +invalidated: + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval, + 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + if (page != page0) { + unlock_page(page); + kunmap(page); + put_page(page); + } + goto out_unmap; + +error: + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_error, + 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out_unmap; +} diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c new file mode 100644 index 000000000000..983f3946ab57 --- /dev/null +++ b/fs/afs/dynroot.c @@ -0,0 +1,209 @@ +/* dir.c: AFS dynamic root handling + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/dns_resolver.h> +#include "internal.h" + +const struct file_operations afs_dynroot_file_operations = { + .open = dcache_dir_open, + .release = dcache_dir_close, + .iterate_shared = dcache_readdir, + .llseek = dcache_dir_lseek, +}; + +/* + * Probe to see if a cell may exist. This prevents positive dentries from + * being created unnecessarily. + */ +static int afs_probe_cell_name(struct dentry *dentry) +{ + struct afs_cell *cell; + const char *name = dentry->d_name.name; + size_t len = dentry->d_name.len; + int ret; + + /* Names prefixed with a dot are R/W mounts. 
*/
+	if (name[0] == '.') {
+		/* A lone "." is not a cell name. */
+		if (len == 1)
+			return -EINVAL;
+		name++;
+		len--;
+	}
+
+	/* If the cell is already known, there's nothing more to probe. */
+	cell = afs_lookup_cell_rcu(afs_d2net(dentry), name, len);
+	if (!IS_ERR(cell)) {
+		afs_put_cell(afs_d2net(dentry), cell);
+		return 0;
+	}
+
+	/* Otherwise ask the DNS resolver; -ENODATA is reported to the caller
+	 * as -EDESTADDRREQ.
+	 */
+	ret = dns_query("afsdb", name, len, "ipv4", NULL, NULL);
+	if (ret == -ENODATA)
+		ret = -EDESTADDRREQ;
+	return ret;
+}
+
+/*
+ * Try to automount the mountpoint with a pseudo directory, if the autocell
+ * option is set on the directory.
+ *
+ * Returns the pseudo-directory inode on success or an ERR_PTR() on failure
+ * (-ENOENT if autocell is not set on @dir).
+ */
+struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir)
+{
+	struct afs_vnode *vnode = AFS_FS_I(dir);
+	struct inode *inode;
+	int ret = -ENOENT;
+
+	_enter("%p{%pd}, {%x:%u}",
+	       dentry, dentry, vnode->fid.vid, vnode->fid.vnode);
+
+	if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
+		goto out;
+
+	/* Don't create an inode for a name that can't be a cell. */
+	ret = afs_probe_cell_name(dentry);
+	if (ret < 0)
+		goto out;
+
+	inode = afs_iget_pseudo_dir(dir->i_sb, false);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto out;
+	}
+
+	_leave("= %p", inode);
+	return inode;
+
+out:
+	_leave("= %d", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Look up @cell in a dynroot directory. This is a substitution for the
+ * local cell name for the net namespace.
+ *
+ * Returns the result of looking the workstation cell's name up in the parent
+ * directory, or an ERR_PTR(): -ENOENT if no workstation cell is set and
+ * -ENOMEM if the temporary name buffer can't be allocated.
+ */
+static struct dentry *afs_lookup_atcell(struct dentry *dentry)
+{
+	struct afs_cell *cell;
+	struct afs_net *net = afs_d2net(dentry);
+	struct dentry *ret;
+	unsigned int seq = 0;
+	char *name;
+	int len;
+
+	if (!net->ws_cell)
+		return ERR_PTR(-ENOENT);
+
+	ret = ERR_PTR(-ENOMEM);
+	name = kmalloc(AFS_MAXCELLNAME + 1, GFP_KERNEL);
+	if (!name)
+		goto out_p;
+
+	/* Take a private copy of the cell name (including its terminating
+	 * NUL) under RCU and the cells_lock seqlock, retrying if the seqlock
+	 * says a retry is needed.
+	 */
+	rcu_read_lock();
+	do {
+		read_seqbegin_or_lock(&net->cells_lock, &seq);
+		cell = rcu_dereference_raw(net->ws_cell);
+		if (cell) {
+			len = cell->name_len;
+			memcpy(name, cell->name, len + 1);
+		}
+	} while (need_seqretry(&net->cells_lock, seq));
+	done_seqretry(&net->cells_lock, seq);
+	rcu_read_unlock();
+
+	/* The workstation cell may have gone away since the check above. */
+	ret = ERR_PTR(-ENOENT);
+	if (!cell)
+		goto out_n;
+
+	ret = lookup_one_len(name, dentry->d_parent, len);
+
+	/* We don't want to d_add() the @cell dentry here as we don't want
+	 * the cached dentry to hide changes to the local cell name.
+	 */
+
+out_n:
+	kfree(name);
+out_p:
+	return ret;
+}
+
+/*
+ * Look up an entry in a dynroot directory.
+ *
+ * "@cell" is redirected to the local cell's name; any other name is offered
+ * to the automounter.  Returns NULL once the dentry has been instantiated
+ * (positively, or negatively on -ENOENT), whatever afs_lookup_atcell()
+ * returns for "@cell", or an ERR_PTR() on any other failure.
+ */
+static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry,
+					 unsigned int flags)
+{
+	struct inode *inode;
+	int ret;
+
+	_enter("%pd", dentry);
+
+	ASSERTCMP(d_inode(dentry), ==, NULL);
+
+	if (dentry->d_name.len >= AFSNAMEMAX) {
+		_leave(" = -ENAMETOOLONG");
+		return ERR_PTR(-ENAMETOOLONG);
+	}
+
+	if (dentry->d_name.len == 5 &&
+	    memcmp(dentry->d_name.name, "@cell", 5) == 0)
+		return afs_lookup_atcell(dentry);
+
+	inode = afs_try_auto_mntpt(dentry, dir);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		if (ret == -ENOENT) {
+			/* Cache the miss as a negative dentry. */
+			d_add(dentry, NULL);
+			_leave(" = NULL [negative]");
+			return NULL;
+		}
+		_leave(" = %d [do]", ret);
+		return ERR_PTR(ret);
+	}
+
+	d_add(dentry, inode);
+	_leave(" = 0 { ino=%lu v=%u }",
+	       d_inode(dentry)->i_ino, d_inode(dentry)->i_generation);
+	return NULL;
+}
+
+const struct inode_operations afs_dynroot_inode_operations = {
+	.lookup		= afs_dynroot_lookup,
+};
+
+/*
+ * Dirs in the dynamic root don't need revalidation.
+ */
+static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	return 1;
+}
+
+/*
+ * Allow the VFS to enquire as to whether a dentry should be unhashed (mustn't
+ * sleep)
+ * - called from dput() when d_count is going to 0.
+ * - return 1 to request dentry be unhashed, 0 otherwise + */ +static int afs_dynroot_d_delete(const struct dentry *dentry) +{ + return d_really_is_positive(dentry); +} + +const struct dentry_operations afs_dynroot_dentry_operations = { + .d_revalidate = afs_dynroot_d_revalidate, + .d_delete = afs_dynroot_d_delete, + .d_release = afs_d_release, + .d_automount = afs_d_automount, +}; diff --git a/fs/afs/file.c b/fs/afs/file.c index a39192ced99e..c24c08016dd9 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -30,7 +30,6 @@ static int afs_readpages(struct file *filp, struct address_space *mapping, const struct file_operations afs_file_operations = { .open = afs_open, - .flush = afs_flush, .release = afs_release, .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, @@ -146,6 +145,9 @@ int afs_open(struct inode *inode, struct file *file) if (ret < 0) goto error_af; } + + if (file->f_flags & O_TRUNC) + set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); file->private_data = af; _leave(" = 0"); @@ -170,6 +172,9 @@ int afs_release(struct inode *inode, struct file *file) _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + if ((file->f_mode & FMODE_WRITE)) + return vfs_fsync(file, 0); + file->private_data = NULL; if (af->wb) afs_put_wb_key(af->wb); @@ -187,10 +192,12 @@ void afs_put_read(struct afs_read *req) { int i; - if (atomic_dec_and_test(&req->usage)) { + if (refcount_dec_and_test(&req->usage)) { for (i = 0; i < req->nr_pages; i++) if (req->pages[i]) put_page(req->pages[i]); + if (req->pages != req->array) + kfree(req->pages); kfree(req); } } @@ -240,6 +247,12 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de ret = afs_end_vnode_operation(&fc); } + if (ret == 0) { + afs_stat_v(vnode, n_fetches); + atomic_long_add(desc->actual_len, + &afs_v2net(vnode)->n_fetch_bytes); + } + _leave(" = %d", ret); return ret; } @@ -297,10 +310,11 @@ int afs_page_filler(void *data, struct page *page) * end of the file, the server will return 
a short read and the * unmarshalling code will clear the unfilled space. */ - atomic_set(&req->usage, 1); + refcount_set(&req->usage, 1); req->pos = (loff_t)page->index << PAGE_SHIFT; req->len = PAGE_SIZE; req->nr_pages = 1; + req->pages = req->array; req->pages[0] = page; get_page(page); @@ -309,10 +323,6 @@ int afs_page_filler(void *data, struct page *page) ret = afs_fetch_data(vnode, key, req); afs_put_read(req); - if (ret >= 0 && S_ISDIR(inode->i_mode) && - !afs_dir_check_page(inode, page)) - ret = -EIO; - if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" @@ -339,7 +349,8 @@ int afs_page_filler(void *data, struct page *page) /* send the page to the cache */ #ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page) && - fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) { + fscache_write_page(vnode->cache, page, vnode->status.size, + GFP_KERNEL) != 0) { fscache_uncache_page(vnode->cache, page); BUG_ON(PageFsCache(page)); } @@ -403,7 +414,8 @@ static void afs_readpages_page_done(struct afs_call *call, struct afs_read *req) /* send the page to the cache */ #ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page) && - fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) { + fscache_write_page(vnode->cache, page, vnode->status.size, + GFP_KERNEL) != 0) { fscache_uncache_page(vnode->cache, page); BUG_ON(PageFsCache(page)); } @@ -445,10 +457,11 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping, if (!req) return -ENOMEM; - atomic_set(&req->usage, 1); + refcount_set(&req->usage, 1); req->page_done = afs_readpages_page_done; req->pos = first->index; req->pos <<= PAGE_SHIFT; + req->pages = req->array; /* Transfer the pages to the request. 
We add them in until one fails * to add to the LRU and then we stop (as that'll make a hole in the diff --git a/fs/afs/flock.c b/fs/afs/flock.c index c40ba2fe3cbe..7a0e017070ec 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -613,7 +613,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) posix_test_lock(file, fl); if (fl->fl_type == F_UNLCK) { /* no local locks; consult the server */ - ret = afs_fetch_status(vnode, key); + ret = afs_fetch_status(vnode, key, false); if (ret < 0) goto error; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 88ec38c2d83c..efacdb7c1dee 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -16,6 +16,7 @@ #include <linux/iversion.h> #include "internal.h" #include "afs_fs.h" +#include "xdr_fs.h" static const struct afs_fid afs_zero_fid; @@ -44,109 +45,194 @@ static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid) } /* - * decode an AFSFetchStatus block + * Dump a bad file status record. */ -static void xdr_decode_AFSFetchStatus(const __be32 **_bp, - struct afs_file_status *status, - struct afs_vnode *vnode, - afs_dataversion_t *store_version) +static void xdr_dump_bad(const __be32 *bp) { - afs_dataversion_t expected_version; - const __be32 *bp = *_bp; + __be32 x[4]; + int i; + + pr_notice("AFS XDR: Bad status record\n"); + for (i = 0; i < 5 * 4 * 4; i += 16) { + memcpy(x, bp, 16); + bp += 4; + pr_notice("%03x: %08x %08x %08x %08x\n", + i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3])); + } + + memcpy(x, bp, 4); + pr_notice("0x50: %08x\n", ntohl(x[0])); +} + +/* + * Update the core inode struct from a returned status record. 
+ */ +void afs_update_inode_from_status(struct afs_vnode *vnode, + struct afs_file_status *status, + const afs_dataversion_t *expected_version, + u8 flags) +{ + struct timespec t; umode_t mode; + + t.tv_sec = status->mtime_client; + t.tv_nsec = 0; + vnode->vfs_inode.i_ctime = t; + vnode->vfs_inode.i_mtime = t; + vnode->vfs_inode.i_atime = t; + + if (flags & (AFS_VNODE_META_CHANGED | AFS_VNODE_NOT_YET_SET)) { + vnode->vfs_inode.i_uid = make_kuid(&init_user_ns, status->owner); + vnode->vfs_inode.i_gid = make_kgid(&init_user_ns, status->group); + set_nlink(&vnode->vfs_inode, status->nlink); + + mode = vnode->vfs_inode.i_mode; + mode &= ~S_IALLUGO; + mode |= status->mode; + barrier(); + vnode->vfs_inode.i_mode = mode; + } + + if (!(flags & AFS_VNODE_NOT_YET_SET)) { + if (expected_version && + *expected_version != status->data_version) { + _debug("vnode modified %llx on {%x:%u} [exp %llx]", + (unsigned long long) status->data_version, + vnode->fid.vid, vnode->fid.vnode, + (unsigned long long) *expected_version); + vnode->invalid_before = status->data_version; + if (vnode->status.type == AFS_FTYPE_DIR) { + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + afs_stat_v(vnode, n_inval); + } else { + set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); + } + } else if (vnode->status.type == AFS_FTYPE_DIR) { + /* Expected directory change is handled elsewhere so + * that we can locally edit the directory and save on a + * download. 
+ */ + if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + flags &= ~AFS_VNODE_DATA_CHANGED; + } + } + + if (flags & (AFS_VNODE_DATA_CHANGED | AFS_VNODE_NOT_YET_SET)) { + inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); + i_size_write(&vnode->vfs_inode, status->size); + } +} + +/* + * decode an AFSFetchStatus block + */ +static int xdr_decode_AFSFetchStatus(struct afs_call *call, + const __be32 **_bp, + struct afs_file_status *status, + struct afs_vnode *vnode, + const afs_dataversion_t *expected_version, + struct afs_read *read_req) +{ + const struct afs_xdr_AFSFetchStatus *xdr = (const void *)*_bp; u64 data_version, size; - bool changed = false; - kuid_t owner; - kgid_t group; + u32 type, abort_code; + u8 flags = 0; + int ret; if (vnode) write_seqlock(&vnode->cb_lock); -#define EXTRACT(DST) \ - do { \ - u32 x = ntohl(*bp++); \ - if (DST != x) \ - changed |= true; \ - DST = x; \ - } while (0) - - status->if_version = ntohl(*bp++); - EXTRACT(status->type); - EXTRACT(status->nlink); - size = ntohl(*bp++); - data_version = ntohl(*bp++); - EXTRACT(status->author); - owner = make_kuid(&init_user_ns, ntohl(*bp++)); - changed |= !uid_eq(owner, status->owner); - status->owner = owner; - EXTRACT(status->caller_access); /* call ticket dependent */ - EXTRACT(status->anon_access); - EXTRACT(status->mode); - bp++; /* parent.vnode */ - bp++; /* parent.unique */ - bp++; /* seg size */ - status->mtime_client = ntohl(*bp++); - status->mtime_server = ntohl(*bp++); - group = make_kgid(&init_user_ns, ntohl(*bp++)); - changed |= !gid_eq(group, status->group); - status->group = group; - bp++; /* sync counter */ - data_version |= (u64) ntohl(*bp++) << 32; - EXTRACT(status->lock_count); - size |= (u64) ntohl(*bp++) << 32; - bp++; /* spare 4 */ - *_bp = bp; + if (xdr->if_version != htonl(AFS_FSTATUS_VERSION)) { + pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version)); + goto bad; + } - if (size != status->size) { - status->size = size; - changed |= true; + 
type = ntohl(xdr->type); + abort_code = ntohl(xdr->abort_code); + switch (type) { + case AFS_FTYPE_FILE: + case AFS_FTYPE_DIR: + case AFS_FTYPE_SYMLINK: + if (type != status->type && + vnode && + !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { + pr_warning("Vnode %x:%x:%x changed type %u to %u\n", + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + status->type, type); + goto bad; + } + status->type = type; + break; + case AFS_FTYPE_INVALID: + if (abort_code != 0) { + status->abort_code = abort_code; + ret = 0; + goto out; + } + /* Fall through */ + default: + goto bad; } - status->mode &= S_IALLUGO; - _debug("vnode time %lx, %lx", - status->mtime_client, status->mtime_server); +#define EXTRACT_M(FIELD) \ + do { \ + u32 x = ntohl(xdr->FIELD); \ + if (status->FIELD != x) { \ + flags |= AFS_VNODE_META_CHANGED; \ + status->FIELD = x; \ + } \ + } while (0) - if (vnode) { - if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { - _debug("vnode changed"); - i_size_write(&vnode->vfs_inode, size); - vnode->vfs_inode.i_uid = status->owner; - vnode->vfs_inode.i_gid = status->group; - vnode->vfs_inode.i_generation = vnode->fid.unique; - set_nlink(&vnode->vfs_inode, status->nlink); - - mode = vnode->vfs_inode.i_mode; - mode &= ~S_IALLUGO; - mode |= status->mode; - barrier(); - vnode->vfs_inode.i_mode = mode; - } + EXTRACT_M(nlink); + EXTRACT_M(author); + EXTRACT_M(owner); + EXTRACT_M(caller_access); /* call ticket dependent */ + EXTRACT_M(anon_access); + EXTRACT_M(mode); + EXTRACT_M(group); + + status->mtime_client = ntohl(xdr->mtime_client); + status->mtime_server = ntohl(xdr->mtime_server); + status->lock_count = ntohl(xdr->lock_count); + + size = (u64)ntohl(xdr->size_lo); + size |= (u64)ntohl(xdr->size_hi) << 32; + status->size = size; + + data_version = (u64)ntohl(xdr->data_version_lo); + data_version |= (u64)ntohl(xdr->data_version_hi) << 32; + if (data_version != status->data_version) { + status->data_version = data_version; + flags |= AFS_VNODE_DATA_CHANGED; 
+ } - vnode->vfs_inode.i_ctime.tv_sec = status->mtime_client; - vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime; - vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime; - inode_set_iversion_raw(&vnode->vfs_inode, data_version); + if (read_req) { + read_req->data_version = data_version; + read_req->file_size = size; } - expected_version = status->data_version; - if (store_version) - expected_version = *store_version; + *_bp = (const void *)*_bp + sizeof(*xdr); - if (expected_version != data_version) { - status->data_version = data_version; - if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { - _debug("vnode modified %llx on {%x:%u}", - (unsigned long long) data_version, - vnode->fid.vid, vnode->fid.vnode); - set_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags); - set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); - } - } else if (store_version) { - status->data_version = data_version; + if (vnode) { + if (test_bit(AFS_VNODE_UNSET, &vnode->flags)) + flags |= AFS_VNODE_NOT_YET_SET; + afs_update_inode_from_status(vnode, status, expected_version, + flags); } + ret = 0; + +out: if (vnode) write_sequnlock(&vnode->cb_lock); + return ret; + +bad: + xdr_dump_bad(*_bp); + ret = afs_protocol_error(call, -EBADMSG); + goto out; } /* @@ -274,7 +360,7 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, /* * deliver reply data to an FS.FetchStatus */ -static int afs_deliver_fs_fetch_status(struct afs_call *call) +static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call) { struct afs_vnode *vnode = call->reply[0]; const __be32 *bp; @@ -288,7 +374,9 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); xdr_decode_AFSCallBack(call, vnode, &bp); if (call->reply[1]) 
xdr_decode_AFSVolSync(&bp, call->reply[1]); @@ -300,17 +388,18 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call) /* * FS.FetchStatus operation type */ -static const struct afs_call_type afs_RXFSFetchStatus = { - .name = "FS.FetchStatus", +static const struct afs_call_type afs_RXFSFetchStatus_vnode = { + .name = "FS.FetchStatus(vnode)", .op = afs_FS_FetchStatus, - .deliver = afs_deliver_fs_fetch_status, + .deliver = afs_deliver_fs_fetch_status_vnode, .destructor = afs_flat_call_destructor, }; /* * fetch the status information for a file */ -int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync) +int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync, + bool new_inode) { struct afs_vnode *vnode = fc->vnode; struct afs_call *call; @@ -320,7 +409,8 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy _enter(",%x,{%x:%u},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); - call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); + call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus_vnode, + 16, (21 + 3 + 6) * 4); if (!call) { fc->ac.error = -ENOMEM; return -ENOMEM; @@ -329,6 +419,7 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy call->key = fc->key; call->reply[0] = vnode; call->reply[1] = volsync; + call->expected_version = new_inode ? 
1 : vnode->status.data_version; /* marshall the parameters */ bp = call->request; @@ -464,7 +555,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &vnode->status.data_version, req) < 0) + return afs_protocol_error(call, -EBADMSG); xdr_decode_AFSCallBack(call, vnode, &bp); if (call->reply[1]) xdr_decode_AFSVolSync(&bp, call->reply[1]); @@ -534,6 +627,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req) call->reply[0] = vnode; call->reply[1] = NULL; /* volsync */ call->reply[2] = req; + call->expected_version = vnode->status.data_version; /* marshall the parameters */ bp = call->request; @@ -546,7 +640,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req) bp[6] = 0; bp[7] = htonl(lower_32_bits(req->len)); - atomic_inc(&req->usage); + refcount_inc(&req->usage); call->cb_break = fc->cb_break; afs_use_fs_server(call, fc->cbi); trace_afs_make_fs_call(call, &vnode->fid); @@ -578,6 +672,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req) call->reply[0] = vnode; call->reply[1] = NULL; /* volsync */ call->reply[2] = req; + call->expected_version = vnode->status.data_version; /* marshall the parameters */ bp = call->request; @@ -588,7 +683,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req) bp[4] = htonl(lower_32_bits(req->pos)); bp[5] = htonl(lower_32_bits(req->len)); - atomic_inc(&req->usage); + refcount_inc(&req->usage); call->cb_break = fc->cb_break; afs_use_fs_server(call, fc->cbi); trace_afs_make_fs_call(call, &vnode->fid); @@ -613,8 +708,10 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->reply[1]); - xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL); - 
xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, call->reply[2], NULL, NULL, NULL) < 0 || + xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); xdr_decode_AFSCallBack_raw(&bp, call->reply[3]); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ @@ -645,6 +742,7 @@ static const struct afs_call_type afs_RXFSMakeDir = { int afs_fs_create(struct afs_fs_cursor *fc, const char *name, umode_t mode, + u64 current_data_version, struct afs_fid *newfid, struct afs_file_status *newstatus, struct afs_callback *newcb) @@ -672,6 +770,7 @@ int afs_fs_create(struct afs_fs_cursor *fc, call->reply[1] = newfid; call->reply[2] = newstatus; call->reply[3] = newcb; + call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -715,7 +814,9 @@ static int afs_deliver_fs_remove(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -742,7 +843,8 @@ static const struct afs_call_type afs_RXFSRemoveDir = { /* * remove a file or directory */ -int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir) +int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir, + u64 current_data_version) { struct afs_vnode *vnode = fc->vnode; struct afs_call *call; @@ -764,6 +866,7 @@ int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir) call->key = fc->key; call->reply[0] = vnode; + call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -801,8 +904,10 @@ static int afs_deliver_fs_link(struct afs_call 
*call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, NULL, NULL) < 0 || + xdr_decode_AFSFetchStatus(call, &bp, &dvnode->status, dvnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -823,7 +928,7 @@ static const struct afs_call_type afs_RXFSLink = { * make a hard link */ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode, - const char *name) + const char *name, u64 current_data_version) { struct afs_vnode *dvnode = fc->vnode; struct afs_call *call; @@ -844,6 +949,7 @@ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode, call->key = fc->key; call->reply[0] = dvnode; call->reply[1] = vnode; + call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -885,8 +991,10 @@ static int afs_deliver_fs_symlink(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->reply[1]); - xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL); - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, call->reply[2], NULL, NULL, NULL) || + xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -909,6 +1017,7 @@ static const struct afs_call_type afs_RXFSSymlink = { int afs_fs_symlink(struct afs_fs_cursor *fc, const char *name, const char *contents, + u64 current_data_version, struct afs_fid *newfid, struct afs_file_status *newstatus) { @@ -937,6 +1046,7 @@ int afs_fs_symlink(struct afs_fs_cursor *fc, 
call->reply[0] = vnode; call->reply[1] = newfid; call->reply[2] = newstatus; + call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -987,10 +1097,13 @@ static int afs_deliver_fs_rename(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode, NULL); - if (new_dvnode != orig_dvnode) - xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode, - NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &orig_dvnode->status, orig_dvnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); + if (new_dvnode != orig_dvnode && + xdr_decode_AFSFetchStatus(call, &bp, &new_dvnode->status, new_dvnode, + &call->expected_version_2, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -1013,7 +1126,9 @@ static const struct afs_call_type afs_RXFSRename = { int afs_fs_rename(struct afs_fs_cursor *fc, const char *orig_name, struct afs_vnode *new_dvnode, - const char *new_name) + const char *new_name, + u64 current_orig_data_version, + u64 current_new_data_version) { struct afs_vnode *orig_dvnode = fc->vnode; struct afs_call *call; @@ -1041,6 +1156,8 @@ int afs_fs_rename(struct afs_fs_cursor *fc, call->key = fc->key; call->reply[0] = orig_dvnode; call->reply[1] = new_dvnode; + call->expected_version = current_orig_data_version + 1; + call->expected_version_2 = current_new_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1089,8 +1206,9 @@ static int afs_deliver_fs_store_data(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, - &call->store_version); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); 
/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ afs_pages_written_back(vnode, call); @@ -1147,7 +1265,7 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc, call->first_offset = offset; call->last_to = to; call->send_pages = true; - call->store_version = vnode->status.data_version + 1; + call->expected_version = vnode->status.data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1222,7 +1340,7 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping, call->first_offset = offset; call->last_to = to; call->send_pages = true; - call->store_version = vnode->status.data_version + 1; + call->expected_version = vnode->status.data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1252,7 +1370,6 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping, */ static int afs_deliver_fs_store_status(struct afs_call *call) { - afs_dataversion_t *store_version; struct afs_vnode *vnode = call->reply[0]; const __be32 *bp; int ret; @@ -1264,12 +1381,10 @@ static int afs_deliver_fs_store_status(struct afs_call *call) return ret; /* unmarshall the reply once we've received all of it */ - store_version = NULL; - if (call->operation_ID == FSSTOREDATA) - store_version = &call->store_version; - bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -1324,7 +1439,7 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr) call->key = fc->key; call->reply[0] = vnode; - call->store_version = vnode->status.data_version + 1; + call->expected_version = vnode->status.data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1373,7 +1488,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr 
*attr) call->key = fc->key; call->reply[0] = vnode; - call->store_version = vnode->status.data_version + 1; + call->expected_version = vnode->status.data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1418,6 +1533,7 @@ int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr) call->key = fc->key; call->reply[0] = vnode; + call->expected_version = vnode->status.data_version; /* marshall the parameters */ bp = call->request; @@ -1471,7 +1587,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("volname length: %u", call->count); if (call->count >= AFSNAMEMAX) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->offset = 0; call->unmarshall++; @@ -1518,7 +1634,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("offline msg length: %u", call->count); if (call->count >= AFSNAMEMAX) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->offset = 0; call->unmarshall++; @@ -1565,7 +1681,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("motd length: %u", call->count); if (call->count >= AFSNAMEMAX) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->offset = 0; call->unmarshall++; @@ -1947,3 +2063,265 @@ int afs_fs_get_capabilities(struct afs_net *net, trace_afs_make_fs_call(call, NULL); return afs_make_call(ac, call, GFP_NOFS, false); } + +/* + * Deliver reply data to an FS.FetchStatus with no vnode. 
+ */ +static int afs_deliver_fs_fetch_status(struct afs_call *call) +{ + struct afs_file_status *status = call->reply[1]; + struct afs_callback *callback = call->reply[2]; + struct afs_volsync *volsync = call->reply[3]; + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(call, &bp, status, vnode, + &call->expected_version, NULL); + callback[call->count].version = ntohl(bp[0]); + callback[call->count].expiry = ntohl(bp[1]); + callback[call->count].type = ntohl(bp[2]); + if (vnode) + xdr_decode_AFSCallBack(call, vnode, &bp); + else + bp += 3; + if (volsync) + xdr_decode_AFSVolSync(&bp, volsync); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.FetchStatus operation type + */ +static const struct afs_call_type afs_RXFSFetchStatus = { + .name = "FS.FetchStatus", + .op = afs_FS_FetchStatus, + .deliver = afs_deliver_fs_fetch_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for a fid without needing a vnode handle. 
+ */ +int afs_fs_fetch_status(struct afs_fs_cursor *fc, + struct afs_net *net, + struct afs_fid *fid, + struct afs_file_status *status, + struct afs_callback *callback, + struct afs_volsync *volsync) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(fc->key), fid->vid, fid->vnode); + + call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); + if (!call) { + fc->ac.error = -ENOMEM; + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = NULL; /* vnode for fid[0] */ + call->reply[1] = status; + call->reply[2] = callback; + call->reply[3] = volsync; + call->expected_version = 1; /* vnode->status.data_version */ + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSFETCHSTATUS); + bp[1] = htonl(fid->vid); + bp[2] = htonl(fid->vnode); + bp[3] = htonl(fid->unique); + + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an FS.InlineBulkStatus call + */ +static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) +{ + struct afs_file_status *statuses; + struct afs_callback *callbacks; + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + u32 tmp; + int ret; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* Extract the file status count and array in two steps */ + case 1: + _debug("extract status count"); + ret = afs_extract_data(call, &call->tmp, 4, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("status count: %u/%u", tmp, call->count2); + if (tmp != call->count2) + return afs_protocol_error(call, -EBADMSG); + + call->count = 0; + call->unmarshall++; + more_counts: + call->offset = 0; + + case 2: + _debug("extract status array %u", call->count); + ret = afs_extract_data(call, call->buffer, 21 * 4, true); + if (ret < 0) + 
return ret; + + bp = call->buffer; + statuses = call->reply[1]; + if (xdr_decode_AFSFetchStatus(call, &bp, &statuses[call->count], + call->count == 0 ? vnode : NULL, + NULL, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); + + call->count++; + if (call->count < call->count2) + goto more_counts; + + call->count = 0; + call->unmarshall++; + call->offset = 0; + + /* Extract the callback count and array in two steps */ + case 3: + _debug("extract CB count"); + ret = afs_extract_data(call, &call->tmp, 4, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("CB count: %u", tmp); + if (tmp != call->count2) + return afs_protocol_error(call, -EBADMSG); + call->count = 0; + call->unmarshall++; + more_cbs: + call->offset = 0; + + case 4: + _debug("extract CB array"); + ret = afs_extract_data(call, call->buffer, 3 * 4, true); + if (ret < 0) + return ret; + + _debug("unmarshall CB array"); + bp = call->buffer; + callbacks = call->reply[2]; + callbacks[call->count].version = ntohl(bp[0]); + callbacks[call->count].expiry = ntohl(bp[1]); + callbacks[call->count].type = ntohl(bp[2]); + statuses = call->reply[1]; + if (call->count == 0 && vnode && statuses[0].abort_code == 0) + xdr_decode_AFSCallBack(call, vnode, &bp); + call->count++; + if (call->count < call->count2) + goto more_cbs; + + call->offset = 0; + call->unmarshall++; + + case 5: + ret = afs_extract_data(call, call->buffer, 6 * 4, false); + if (ret < 0) + return ret; + + bp = call->buffer; + if (call->reply[3]) + xdr_decode_AFSVolSync(&bp, call->reply[3]); + + call->offset = 0; + call->unmarshall++; + + case 6: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.InlineBulkStatus operation type + */ +static const struct afs_call_type afs_RXFSInlineBulkStatus = { + .name = "FS.InlineBulkStatus", + .op = afs_FS_InlineBulkStatus, + .deliver = afs_deliver_fs_inline_bulk_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for up to 50 files + */ 
+int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc, + struct afs_net *net, + struct afs_fid *fids, + struct afs_file_status *statuses, + struct afs_callback *callbacks, + unsigned int nr_fids, + struct afs_volsync *volsync) +{ + struct afs_call *call; + __be32 *bp; + int i; + + _enter(",%x,{%x:%u},%u", + key_serial(fc->key), fids[0].vid, fids[1].vnode, nr_fids); + + call = afs_alloc_flat_call(net, &afs_RXFSInlineBulkStatus, + (2 + nr_fids * 3) * 4, + 21 * 4); + if (!call) { + fc->ac.error = -ENOMEM; + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = NULL; /* vnode for fid[0] */ + call->reply[1] = statuses; + call->reply[2] = callbacks; + call->reply[3] = volsync; + call->count2 = nr_fids; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSINLINEBULKSTATUS); + *bp++ = htonl(nr_fids); + for (i = 0; i < nr_fids; i++) { + *bp++ = htonl(fids[i].vid); + *bp++ = htonl(fids[i].vnode); + *bp++ = htonl(fids[i].unique); + } + + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &fids[0]); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 6b39d0255b72..06194cfe9724 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -30,12 +30,11 @@ static const struct inode_operations afs_symlink_inode_operations = { }; /* - * map the AFS file status to the inode member variables + * Initialise an inode from the vnode status. 
*/ -static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) +static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key) { struct inode *inode = AFS_VNODE_TO_I(vnode); - bool changed; _debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu", vnode->status.type, @@ -46,16 +45,21 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) read_seqlock_excl(&vnode->cb_lock); + afs_update_inode_from_status(vnode, &vnode->status, NULL, + AFS_VNODE_NOT_YET_SET); + switch (vnode->status.type) { case AFS_FTYPE_FILE: inode->i_mode = S_IFREG | vnode->status.mode; inode->i_op = &afs_file_inode_operations; inode->i_fop = &afs_file_operations; + inode->i_mapping->a_ops = &afs_fs_aops; break; case AFS_FTYPE_DIR: inode->i_mode = S_IFDIR | vnode->status.mode; inode->i_op = &afs_dir_inode_operations; inode->i_fop = &afs_dir_file_operations; + inode->i_mapping->a_ops = &afs_dir_aops; break; case AFS_FTYPE_SYMLINK: /* Symlinks with a mode of 0644 are actually mountpoints. 
*/ @@ -67,45 +71,31 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_mode = S_IFDIR | 0555; inode->i_op = &afs_mntpt_inode_operations; inode->i_fop = &afs_mntpt_file_operations; + inode->i_mapping->a_ops = &afs_fs_aops; } else { inode->i_mode = S_IFLNK | vnode->status.mode; inode->i_op = &afs_symlink_inode_operations; + inode->i_mapping->a_ops = &afs_fs_aops; } inode_nohighmem(inode); break; default: printk("kAFS: AFS vnode with undefined type\n"); read_sequnlock_excl(&vnode->cb_lock); - return -EBADMSG; + return afs_protocol_error(NULL, -EBADMSG); } - changed = (vnode->status.size != inode->i_size); - - set_nlink(inode, vnode->status.nlink); - inode->i_uid = vnode->status.owner; - inode->i_gid = vnode->status.group; - inode->i_size = vnode->status.size; - inode->i_ctime.tv_sec = vnode->status.mtime_client; - inode->i_ctime.tv_nsec = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime; inode->i_blocks = 0; - inode->i_generation = vnode->fid.unique; - inode_set_iversion_raw(inode, vnode->status.data_version); - inode->i_mapping->a_ops = &afs_fs_aops; + vnode->invalid_before = vnode->status.data_version; read_sequnlock_excl(&vnode->cb_lock); - -#ifdef CONFIG_AFS_FSCACHE - if (changed) - fscache_attr_changed(vnode->cache); -#endif return 0; } /* * Fetch file status from the volume. 
*/ -int afs_fetch_status(struct afs_vnode *vnode, struct key *key) +int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode) { struct afs_fs_cursor fc; int ret; @@ -119,7 +109,7 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key) if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { fc.cb_break = vnode->cb_break + vnode->cb_s_break; - afs_fs_fetch_file_status(&fc, NULL); + afs_fs_fetch_file_status(&fc, NULL, new_inode); } afs_check_for_remote_deletion(&fc, fc.vnode); @@ -243,6 +233,38 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) } /* + * Get a cache cookie for an inode. + */ +static void afs_get_inode_cache(struct afs_vnode *vnode) +{ +#ifdef CONFIG_AFS_FSCACHE + struct { + u32 vnode_id; + u32 unique; + u32 vnode_id_ext[2]; /* Allow for a 96-bit key */ + } __packed key; + struct afs_vnode_cache_aux aux; + + if (vnode->status.type == AFS_FTYPE_DIR) { + vnode->cache = NULL; + return; + } + + key.vnode_id = vnode->fid.vnode; + key.unique = vnode->fid.unique; + key.vnode_id_ext[0] = 0; + key.vnode_id_ext[1] = 0; + aux.data_version = vnode->status.data_version; + + vnode->cache = fscache_acquire_cookie(vnode->volume->cache, + &afs_vnode_cache_index_def, + &key, sizeof(key), + &aux, sizeof(aux), + vnode, vnode->status.size, true); +#endif +} + +/* * inode retrieval */ struct inode *afs_iget(struct super_block *sb, struct key *key, @@ -280,7 +302,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, if (!status) { /* it's a remotely extant inode */ - ret = afs_fetch_status(vnode, key); + ret = afs_fetch_status(vnode, key, true); if (ret < 0) goto bad_inode; } else { @@ -304,19 +326,12 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, vnode->cb_expires_at += ktime_get_real_seconds(); } - /* set up caching before mapping the status, as map-status reads the - * first page of symlinks to see if they're really mountpoints */ - inode->i_size = 
vnode->status.size; -#ifdef CONFIG_AFS_FSCACHE - vnode->cache = fscache_acquire_cookie(vnode->volume->cache, - &afs_vnode_cache_index_def, - vnode, true); -#endif - - ret = afs_inode_map_status(vnode, key); + ret = afs_inode_init_from_status(vnode, key); if (ret < 0) goto bad_inode; + afs_get_inode_cache(vnode); + /* success */ clear_bit(AFS_VNODE_UNSET, &vnode->flags); inode->i_flags |= S_NOATIME; @@ -326,10 +341,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, /* failure */ bad_inode: -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(vnode->cache, 0); - vnode->cache = NULL; -#endif iget_failed(inode); _leave(" = %d [bad]", ret); return ERR_PTR(ret); @@ -343,6 +354,10 @@ void afs_zap_data(struct afs_vnode *vnode) { _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); +#ifdef CONFIG_AFS_FSCACHE + fscache_invalidate(vnode->cache); +#endif + /* nuke all the non-dirty pages that aren't locked, mapped or being * written back in a regular file and completely discard the pages in a * directory or symlink */ @@ -380,8 +395,11 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break) { vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; - } else if (!test_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags) && - !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && + } else if (vnode->status.type == AFS_FTYPE_DIR && + test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) && + vnode->cb_expires_at - 10 > now) { + valid = true; + } else if (!test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && vnode->cb_expires_at - 10 > now) { valid = true; } @@ -405,7 +423,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) * access */ if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { _debug("not promised"); - ret = afs_fetch_status(vnode, key); + ret = afs_fetch_status(vnode, key, false); if (ret < 0) { if (ret == -ENOENT) { 
set_bit(AFS_VNODE_DELETED, &vnode->flags); @@ -426,8 +444,6 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) * different */ if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) afs_zap_data(vnode); - - clear_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags); mutex_unlock(&vnode->validate_lock); valid: _leave(" = 0"); @@ -507,11 +523,17 @@ void afs_evict_inode(struct inode *inode) } #ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(vnode->cache, 0); - vnode->cache = NULL; + { + struct afs_vnode_cache_aux aux; + + aux.data_version = vnode->status.data_version; + fscache_relinquish_cookie(vnode->cache, &aux, + test_bit(AFS_VNODE_DELETED, &vnode->flags)); + vnode->cache = NULL; + } #endif - afs_put_permits(vnode->permit_cache); + afs_put_permits(rcu_access_pointer(vnode->permit_cache)); _leave(""); } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 72217170b155..f8086ec95e24 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -122,7 +122,8 @@ struct afs_call { u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ __be32 tmp; /* place to extract temporary data */ - afs_dataversion_t store_version; /* updated version expected from store */ + afs_dataversion_t expected_version; /* Updated version expected from store */ + afs_dataversion_t expected_version_2; /* 2nd updated version expected from store */ }; struct afs_call_type { @@ -173,11 +174,14 @@ struct afs_read { loff_t len; /* How much we're asking for */ loff_t actual_len; /* How much we're actually getting */ loff_t remain; /* Amount remaining */ - atomic_t usage; + loff_t file_size; /* File size returned by server */ + afs_dataversion_t data_version; /* Version number returned by server */ + refcount_t usage; unsigned int index; /* Which page we're reading into */ unsigned int nr_pages; void (*page_done)(struct afs_call *, struct afs_read *); - struct page *pages[]; + struct page **pages; + struct page *array[]; }; /* @@ -199,6 
+203,18 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) extern struct file_system_type afs_fs_type; /* + * Set of substitutes for @sys. + */ +struct afs_sysnames { +#define AFS_NR_SYSNAME 16 + char *subs[AFS_NR_SYSNAME]; + refcount_t usage; + unsigned short nr; + short error; + char blank[1]; +}; + +/* * AFS network namespace record. */ struct afs_net { @@ -245,9 +261,25 @@ struct afs_net { struct mutex lock_manager_mutex; /* Misc */ - struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ + struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ + struct afs_sysnames *sysnames; + rwlock_t sysnames_lock; + + /* Statistics counters */ + atomic_t n_lookup; /* Number of lookups done */ + atomic_t n_reval; /* Number of dentries needing revalidation */ + atomic_t n_inval; /* Number of invalidations by the server */ + atomic_t n_relpg; /* Number of invalidations by releasepage */ + atomic_t n_read_dir; /* Number of directory pages read */ + atomic_t n_dir_cr; /* Number of directory entry creation edits */ + atomic_t n_dir_rm; /* Number of directory entry removal edits */ + atomic_t n_stores; /* Number of store ops */ + atomic_long_t n_store_bytes; /* Number of bytes stored */ + atomic_long_t n_fetch_bytes; /* Number of bytes fetched */ + atomic_t n_fetches; /* Number of data fetch ops */ }; +extern const char afs_init_sysname[]; extern struct afs_net __afs_net;// Dummy AFS network namespace; TODO: replace with real netns enum afs_cell_state { @@ -363,6 +395,7 @@ struct afs_server { #define AFS_SERVER_FL_UPDATING 4 #define AFS_SERVER_FL_PROBED 5 /* The fileserver has been probed */ #define AFS_SERVER_FL_PROBING 6 /* Fileserver is being probed */ +#define AFS_SERVER_FL_NO_IBULK 7 /* Fileserver doesn't support FS.InlineBulkStatus */ atomic_t usage; u32 addr_version; /* Address list version */ @@ -455,10 +488,11 @@ struct afs_vnode { struct afs_volume *volume; /* volume on which vnode resides */ struct afs_fid fid; /* the file 
identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ + afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif - struct afs_permits *permit_cache; /* cache of permits so far obtained */ + struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ struct mutex io_lock; /* Lock for serialising I/O on this mutex */ struct mutex validate_lock; /* lock for validating this vnode */ spinlock_t wb_lock; /* lock for wb_keys */ @@ -466,12 +500,13 @@ struct afs_vnode { unsigned long flags; #define AFS_VNODE_CB_PROMISED 0 /* Set if vnode has a callback promise */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ -#define AFS_VNODE_DIR_MODIFIED 2 /* set if dir vnode's data modified */ +#define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ #define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ +#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ @@ -559,6 +594,13 @@ struct afs_fs_cursor { #define AFS_FS_CURSOR_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */ }; +/* + * Cache auxiliary data. 
+ */ +struct afs_vnode_cache_aux { + u64 data_version; +} __packed; + #include <trace/events/afs.h> /*****************************************************************************/ @@ -604,7 +646,7 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def; */ extern void afs_init_callback_state(struct afs_server *); extern void afs_break_callback(struct afs_vnode *); -extern void afs_break_callbacks(struct afs_server *, size_t,struct afs_callback[]); +extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break*); extern int afs_register_server_cb_interest(struct afs_vnode *, struct afs_server_entry *); extern void afs_put_cb_interest(struct afs_net *, struct afs_cb_interest *); @@ -639,11 +681,26 @@ extern bool afs_cm_incoming_call(struct afs_call *); */ extern const struct file_operations afs_dir_file_operations; extern const struct inode_operations afs_dir_inode_operations; +extern const struct address_space_operations afs_dir_aops; +extern const struct dentry_operations afs_fs_dentry_operations; + +extern void afs_d_release(struct dentry *); + +/* + * dir_edit.c + */ +extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, + enum afs_edit_dir_reason); +extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); + +/* + * dynroot.c + */ extern const struct file_operations afs_dynroot_file_operations; extern const struct inode_operations afs_dynroot_inode_operations; -extern const struct dentry_operations afs_fs_dentry_operations; +extern const struct dentry_operations afs_dynroot_dentry_operations; -extern bool afs_dir_check_page(struct inode *, struct page *); +extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *); /* * file.c @@ -673,17 +730,23 @@ extern int afs_flock(struct file *, int, struct file_lock *); /* * fsclient.c */ -extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *); +#define AFS_VNODE_NOT_YET_SET 0x01 +#define 
AFS_VNODE_META_CHANGED 0x02 +#define AFS_VNODE_DATA_CHANGED 0x04 +extern void afs_update_inode_from_status(struct afs_vnode *, struct afs_file_status *, + const afs_dataversion_t *, u8); + +extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *, bool); extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *); extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *); -extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, +extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, u64, struct afs_fid *, struct afs_file_status *, struct afs_callback *); -extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool); -extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *); -extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, +extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool, u64); +extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64); +extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64, struct afs_fid *, struct afs_file_status *); extern int afs_fs_rename(struct afs_fs_cursor *, const char *, - struct afs_vnode *, const char *); + struct afs_vnode *, const char *, u64, u64); extern int afs_fs_store_data(struct afs_fs_cursor *, struct address_space *, pgoff_t, pgoff_t, unsigned, unsigned); extern int afs_fs_setattr(struct afs_fs_cursor *, struct iattr *); @@ -695,11 +758,18 @@ extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *, struct afs_addr_cursor *, struct key *); extern int afs_fs_get_capabilities(struct afs_net *, struct afs_server *, struct afs_addr_cursor *, struct key *); +extern int afs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *, unsigned int, + struct afs_volsync *); +extern int afs_fs_fetch_status(struct 
afs_fs_cursor *, struct afs_net *, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *, struct afs_volsync *); /* * inode.c */ -extern int afs_fetch_status(struct afs_vnode *, struct key *); +extern int afs_fetch_status(struct afs_vnode *, struct key *, bool); extern int afs_iget5_test(struct inode *, void *); extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); extern struct inode *afs_iget(struct super_block *, struct key *, @@ -747,6 +817,13 @@ static inline void afs_put_net(struct afs_net *net) { } +static inline void __afs_stat(atomic_t *s) +{ + atomic_inc(s); +} + +#define afs_stat_v(vnode, n) __afs_stat(&afs_v2net(vnode)->n) + /* * misc.c */ @@ -774,6 +851,7 @@ extern int __net_init afs_proc_init(struct afs_net *); extern void __net_exit afs_proc_cleanup(struct afs_net *); extern int afs_proc_cell_setup(struct afs_net *, struct afs_cell *); extern void afs_proc_cell_remove(struct afs_net *, struct afs_cell *); +extern void afs_put_sysnames(struct afs_sysnames *); /* * rotate.c @@ -802,6 +880,7 @@ extern void afs_flat_call_destructor(struct afs_call *); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, void *, size_t, bool); +extern int afs_protocol_error(struct afs_call *, int); static inline int afs_transfer_reply(struct afs_call *call) { @@ -948,7 +1027,6 @@ extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); -extern int afs_flush(struct file *, fl_owner_t); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern int afs_page_mkwrite(struct vm_fault *); extern void afs_prune_wb_keys(struct afs_vnode *); diff --git a/fs/afs/main.c b/fs/afs/main.c index 
15a02a05ff40..d7560168b3bf 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -34,11 +34,42 @@ MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); struct workqueue_struct *afs_wq; struct afs_net __afs_net; +#if defined(CONFIG_ALPHA) +const char afs_init_sysname[] = "alpha_linux26"; +#elif defined(CONFIG_X86_64) +const char afs_init_sysname[] = "amd64_linux26"; +#elif defined(CONFIG_ARM) +const char afs_init_sysname[] = "arm_linux26"; +#elif defined(CONFIG_ARM64) +const char afs_init_sysname[] = "aarch64_linux26"; +#elif defined(CONFIG_X86_32) +const char afs_init_sysname[] = "i386_linux26"; +#elif defined(CONFIG_IA64) +const char afs_init_sysname[] = "ia64_linux26"; +#elif defined(CONFIG_PPC64) +const char afs_init_sysname[] = "ppc64_linux26"; +#elif defined(CONFIG_PPC32) +const char afs_init_sysname[] = "ppc_linux26"; +#elif defined(CONFIG_S390) +#ifdef CONFIG_64BIT +const char afs_init_sysname[] = "s390x_linux26"; +#else +const char afs_init_sysname[] = "s390_linux26"; +#endif +#elif defined(CONFIG_SPARC64) +const char afs_init_sysname[] = "sparc64_linux26"; +#elif defined(CONFIG_SPARC32) +const char afs_init_sysname[] = "sparc_linux26"; +#else +const char afs_init_sysname[] = "unknown_linux26"; +#endif + /* * Initialise an AFS network namespace record. 
*/ static int __net_init afs_net_init(struct afs_net *net) { + struct afs_sysnames *sysnames; int ret; net->live = true; @@ -67,6 +98,16 @@ static int __net_init afs_net_init(struct afs_net *net) INIT_WORK(&net->fs_manager, afs_manage_servers); timer_setup(&net->fs_timer, afs_servers_timer, 0); + ret = -ENOMEM; + sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); + if (!sysnames) + goto error_sysnames; + sysnames->subs[0] = (char *)&afs_init_sysname; + sysnames->nr = 1; + refcount_set(&sysnames->usage, 1); + net->sysnames = sysnames; + rwlock_init(&net->sysnames_lock); + /* Register the /proc stuff */ ret = afs_proc_init(net); if (ret < 0) @@ -92,6 +133,8 @@ error_cell_init: net->live = false; afs_proc_cleanup(net); error_proc: + afs_put_sysnames(net->sysnames); +error_sysnames: net->live = false; return ret; } @@ -106,6 +149,7 @@ static void __net_exit afs_net_exit(struct afs_net *net) afs_purge_servers(net); afs_close_socket(net); afs_proc_cleanup(net); + afs_put_sysnames(net->sysnames); } /* diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 4508dd54f789..839a22280606 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -126,6 +126,34 @@ static const struct file_operations afs_proc_servers_fops = { .release = seq_release, }; +static int afs_proc_sysname_open(struct inode *inode, struct file *file); +static int afs_proc_sysname_release(struct inode *inode, struct file *file); +static void *afs_proc_sysname_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_sysname_next(struct seq_file *p, void *v, + loff_t *pos); +static void afs_proc_sysname_stop(struct seq_file *p, void *v); +static int afs_proc_sysname_show(struct seq_file *m, void *v); +static ssize_t afs_proc_sysname_write(struct file *file, + const char __user *buf, + size_t size, loff_t *_pos); + +static const struct seq_operations afs_proc_sysname_ops = { + .start = afs_proc_sysname_start, + .next = afs_proc_sysname_next, + .stop = afs_proc_sysname_stop, + .show = afs_proc_sysname_show, +}; + 
+static const struct file_operations afs_proc_sysname_fops = { + .open = afs_proc_sysname_open, + .read = seq_read, + .llseek = seq_lseek, + .release = afs_proc_sysname_release, + .write = afs_proc_sysname_write, +}; + +static const struct file_operations afs_proc_stats_fops; + /* * initialise the /proc/fs/afs/ directory */ @@ -139,7 +167,9 @@ int afs_proc_init(struct afs_net *net) if (!proc_create("cells", 0644, net->proc_afs, &afs_proc_cells_fops) || !proc_create("rootcell", 0644, net->proc_afs, &afs_proc_rootcell_fops) || - !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops)) + !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops) || + !proc_create("stats", 0644, net->proc_afs, &afs_proc_stats_fops) || + !proc_create("sysname", 0644, net->proc_afs, &afs_proc_sysname_fops)) goto error_tree; _leave(" = 0"); @@ -183,6 +213,7 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file) * first item */ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) { struct afs_net *net = afs_seq2net(m); @@ -204,6 +235,7 @@ static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos) * clean up after reading from the cells list */ static void afs_proc_cells_stop(struct seq_file *m, void *v) + __releases(rcu) { rcu_read_unlock(); } @@ -282,7 +314,8 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, goto done; } - set_bit(AFS_CELL_FL_NO_GC, &cell->flags); + if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags)) + afs_put_cell(net, cell); printk("kAFS: Added new cell '%s'\n", name); } else { goto inval; @@ -304,7 +337,40 @@ inval: static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, size_t size, loff_t *_pos) { - return 0; + struct afs_cell *cell; + struct afs_net *net = afs_proc2net(file); + unsigned int seq = 0; + char name[AFS_MAXCELLNAME + 1]; + int len; + + if (*_pos > 0) + return 0; + if (!net->ws_cell) + return 0; + + 
rcu_read_lock(); + do { + read_seqbegin_or_lock(&net->cells_lock, &seq); + len = 0; + cell = rcu_dereference_raw(net->ws_cell); + if (cell) { + len = cell->name_len; + memcpy(name, cell->name, len); + } + } while (need_seqretry(&net->cells_lock, seq)); + done_seqretry(&net->cells_lock, seq); + rcu_read_unlock(); + + if (!len) + return 0; + + name[len++] = '\n'; + if (len > size) + len = size; + if (copy_to_user(buf, name, len) != 0) + return -EFAULT; + *_pos = 1; + return len; } /* @@ -327,6 +393,12 @@ static ssize_t afs_proc_rootcell_write(struct file *file, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); + ret = -EINVAL; + if (kbuf[0] == '.') + goto out; + if (memchr(kbuf, '/', size)) + goto out; + /* trim to first NL */ s = memchr(kbuf, '\n', size); if (s) @@ -339,6 +411,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file, if (ret >= 0) ret = size; /* consume everything, always */ +out: kfree(kbuf); _leave(" = %d", ret); return ret; @@ -413,6 +486,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) * first item */ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) + __acquires(cell->proc_lock) { struct afs_cell *cell = m->private; @@ -438,6 +512,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, * clean up after reading from the cells list */ static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v) + __releases(cell->proc_lock) { struct afs_cell *cell = p->private; @@ -500,6 +575,7 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) * first item */ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) { struct afs_addr_list *alist; struct afs_cell *cell = m->private; @@ -544,6 +620,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, * clean up after reading from the cells list */ static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v) + __releases(rcu) { 
rcu_read_unlock(); } @@ -580,6 +657,7 @@ static int afs_proc_servers_open(struct inode *inode, struct file *file) * first item. */ static void *afs_proc_servers_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) { struct afs_net *net = afs_seq2net(m); @@ -601,6 +679,7 @@ static void *afs_proc_servers_next(struct seq_file *m, void *v, loff_t *_pos) * clean up after reading from the cells list */ static void afs_proc_servers_stop(struct seq_file *p, void *v) + __releases(rcu) { rcu_read_unlock(); } @@ -626,3 +705,244 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) &alist->addrs[alist->index].transport); return 0; } + +void afs_put_sysnames(struct afs_sysnames *sysnames) +{ + int i; + + if (sysnames && refcount_dec_and_test(&sysnames->usage)) { + for (i = 0; i < sysnames->nr; i++) + if (sysnames->subs[i] != afs_init_sysname && + sysnames->subs[i] != sysnames->blank) + kfree(sysnames->subs[i]); + } +} + +/* + * Handle opening of /proc/fs/afs/sysname. If it is opened for writing, we + * assume the caller wants to change the substitution list and we allocate a + * buffer to hold the list. + */ +static int afs_proc_sysname_open(struct inode *inode, struct file *file) +{ + struct afs_sysnames *sysnames; + struct seq_file *m; + int ret; + + ret = seq_open(file, &afs_proc_sysname_ops); + if (ret < 0) + return ret; + + if (file->f_mode & FMODE_WRITE) { + sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); + if (!sysnames) { + seq_release(inode, file); + return -ENOMEM; + } + + refcount_set(&sysnames->usage, 1); + m = file->private_data; + m->private = sysnames; + } + + return 0; +} + +/* + * Handle writes to /proc/fs/afs/sysname to set the @sys substitution. 
+ */ +static ssize_t afs_proc_sysname_write(struct file *file, + const char __user *buf, + size_t size, loff_t *_pos) +{ + struct afs_sysnames *sysnames; + struct seq_file *m = file->private_data; + char *kbuf = NULL, *s, *p, *sub; + int ret, len; + + sysnames = m->private; + if (!sysnames) + return -EINVAL; + if (sysnames->error) + return sysnames->error; + + if (size >= PAGE_SIZE - 1) { + sysnames->error = -EINVAL; + return -EINVAL; + } + if (size == 0) + return 0; + + kbuf = memdup_user_nul(buf, size); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + inode_lock(file_inode(file)); + + p = kbuf; + while ((s = strsep(&p, " \t\n"))) { + len = strlen(s); + if (len == 0) + continue; + ret = -ENAMETOOLONG; + if (len >= AFSNAMEMAX) + goto error; + + if (len >= 4 && + s[len - 4] == '@' && + s[len - 3] == 's' && + s[len - 2] == 'y' && + s[len - 1] == 's') + /* Protect against recursion */ + goto invalid; + + if (s[0] == '.' && + (len < 2 || (len == 2 && s[1] == '.'))) + goto invalid; + + if (memchr(s, '/', len)) + goto invalid; + + ret = -EFBIG; + if (sysnames->nr >= AFS_NR_SYSNAME) + goto out; + + if (strcmp(s, afs_init_sysname) == 0) { + sub = (char *)afs_init_sysname; + } else { + ret = -ENOMEM; + sub = kmemdup(s, len + 1, GFP_KERNEL); + if (!sub) + goto out; + } + + sysnames->subs[sysnames->nr] = sub; + sysnames->nr++; + } + + ret = size; /* consume everything, always */ +out: + inode_unlock(file_inode(file)); + kfree(kbuf); + return ret; + +invalid: + ret = -EINVAL; +error: + sysnames->error = ret; + goto out; +} + +static int afs_proc_sysname_release(struct inode *inode, struct file *file) +{ + struct afs_sysnames *sysnames, *kill = NULL; + struct seq_file *m = file->private_data; + struct afs_net *net = afs_seq2net(m); + + sysnames = m->private; + if (sysnames) { + if (!sysnames->error) { + kill = sysnames; + if (sysnames->nr == 0) { + sysnames->subs[0] = sysnames->blank; + sysnames->nr++; + } + write_lock(&net->sysnames_lock); + kill = net->sysnames; + 
net->sysnames = sysnames; + write_unlock(&net->sysnames_lock); + } + afs_put_sysnames(kill); + } + + return seq_release(inode, file); +} + +static void *afs_proc_sysname_start(struct seq_file *m, loff_t *pos) + __acquires(&net->sysnames_lock) +{ + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *names = net->sysnames; + + read_lock(&net->sysnames_lock); + + if (*pos >= names->nr) + return NULL; + return (void *)(unsigned long)(*pos + 1); +} + +static void *afs_proc_sysname_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *names = net->sysnames; + + *pos += 1; + if (*pos >= names->nr) + return NULL; + return (void *)(unsigned long)(*pos + 1); +} + +static void afs_proc_sysname_stop(struct seq_file *m, void *v) + __releases(&net->sysnames_lock) +{ + struct afs_net *net = afs_seq2net(m); + + read_unlock(&net->sysnames_lock); +} + +static int afs_proc_sysname_show(struct seq_file *m, void *v) +{ + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *sysnames = net->sysnames; + unsigned int i = (unsigned long)v - 1; + + if (i < sysnames->nr) + seq_printf(m, "%s\n", sysnames->subs[i]); + return 0; +} + +/* + * Display general per-net namespace statistics + */ +static int afs_proc_stats_show(struct seq_file *m, void *v) +{ + struct afs_net *net = afs_seq2net(m); + + seq_puts(m, "kAFS statistics\n"); + + seq_printf(m, "dir-mgmt: look=%u reval=%u inval=%u relpg=%u\n", + atomic_read(&net->n_lookup), + atomic_read(&net->n_reval), + atomic_read(&net->n_inval), + atomic_read(&net->n_relpg)); + + seq_printf(m, "dir-data: rdpg=%u\n", + atomic_read(&net->n_read_dir)); + + seq_printf(m, "dir-edit: cr=%u rm=%u\n", + atomic_read(&net->n_dir_cr), + atomic_read(&net->n_dir_rm)); + + seq_printf(m, "file-rd : n=%u nb=%lu\n", + atomic_read(&net->n_fetches), + atomic_long_read(&net->n_fetch_bytes)); + seq_printf(m, "file-wr : n=%u nb=%lu\n", + atomic_read(&net->n_stores), + 
atomic_long_read(&net->n_store_bytes)); + return 0; +} + +/* + * Open "/proc/fs/afs/stats" to allow reading of the stat counters. + */ +static int afs_proc_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, afs_proc_stats_show, NULL); +} + +static const struct file_operations afs_proc_stats_fops = { + .open = afs_proc_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index ad1328d85526..ac0feac9d746 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -21,7 +21,7 @@ /* * Initialise a filesystem server cursor for iterating over FS servers. */ -void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) +static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) { memset(fc, 0, sizeof(*fc)); } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index f7ae54b6a393..5c6263972ec9 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -926,3 +926,12 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, afs_set_call_complete(call, ret, remote_abort); return ret; } + +/* + * Log protocol error production. + */ +noinline int afs_protocol_error(struct afs_call *call, int error) +{ + trace_afs_protocol_error(call, error, __builtin_return_address(0)); + return error; +} diff --git a/fs/afs/security.c b/fs/afs/security.c index b88b7d45fdaa..cea2fff313dc 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -178,18 +178,14 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, } } - if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break)) { - rcu_read_unlock(); + if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break)) goto someone_else_changed_it; - } /* We need a ref on any permits list we want to copy as we'll have to * drop the lock to do memory allocation. 
*/ - if (permits && !refcount_inc_not_zero(&permits->usage)) { - rcu_read_unlock(); + if (permits && !refcount_inc_not_zero(&permits->usage)) goto someone_else_changed_it; - } rcu_read_unlock(); @@ -278,6 +274,7 @@ someone_else_changed_it: /* Someone else changed the cache under us - don't recheck at this * time. */ + rcu_read_unlock(); return; } @@ -296,8 +293,6 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, _enter("{%x:%u},%x", vnode->fid.vid, vnode->fid.vnode, key_serial(key)); - permits = vnode->permit_cache; - /* check the permits to see if we've got one yet */ if (key == vnode->volume->cell->anonymous_key) { _debug("anon"); @@ -327,7 +322,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, */ _debug("no valid permit"); - ret = afs_fetch_status(vnode, key); + ret = afs_fetch_status(vnode, key, false); if (ret < 0) { *_access = 0; _leave(" = %d", ret); diff --git a/fs/afs/server.c b/fs/afs/server.c index a43ef77dabae..e23be63998a8 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -59,7 +59,8 @@ struct afs_server *afs_find_server(struct afs_net *net, alist = rcu_dereference(server->addresses); for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { b = &alist->addrs[i].transport.sin6; - diff = (u16)a->sin6_port - (u16)b->sin6_port; + diff = ((u16 __force)a->sin6_port - + (u16 __force)b->sin6_port); if (diff == 0) diff = memcmp(&a->sin6_addr, &b->sin6_addr, @@ -79,10 +80,11 @@ struct afs_server *afs_find_server(struct afs_net *net, alist = rcu_dereference(server->addresses); for (i = 0; i < alist->nr_ipv4; i++) { b = &alist->addrs[i].transport.sin6; - diff = (u16)a->sin6_port - (u16)b->sin6_port; + diff = ((u16 __force)a->sin6_port - + (u16 __force)b->sin6_port); if (diff == 0) - diff = ((u32)a->sin6_addr.s6_addr32[3] - - (u32)b->sin6_addr.s6_addr32[3]); + diff = ((u32 __force)a->sin6_addr.s6_addr32[3] - + (u32 __force)b->sin6_addr.s6_addr32[3]); if (diff == 0) goto found; if (diff < 0) { @@ -381,7 +383,7 @@ static void 
afs_server_rcu(struct rcu_head *rcu) { struct afs_server *server = container_of(rcu, struct afs_server, rcu); - afs_put_addrlist(server->addresses); + afs_put_addrlist(rcu_access_pointer(server->addresses)); kfree(server); } @@ -390,7 +392,7 @@ static void afs_server_rcu(struct rcu_head *rcu) */ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) { - struct afs_addr_list *alist = server->addresses; + struct afs_addr_list *alist = rcu_access_pointer(server->addresses); struct afs_addr_cursor ac = { .alist = alist, .addr = &alist->addrs[0], diff --git a/fs/afs/super.c b/fs/afs/super.c index 3623c952b6ff..65081ec3c36e 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -154,7 +154,7 @@ static int afs_show_devname(struct seq_file *m, struct dentry *root) seq_puts(m, "none"); return 0; } - + switch (volume->type) { case AFSVL_RWVOL: break; @@ -269,7 +269,7 @@ static int afs_parse_device_name(struct afs_mount_params *params, int cellnamesz; _enter(",%s", name); - + if (!name) { printk(KERN_ERR "kAFS: no volume name specified\n"); return -EINVAL; @@ -418,7 +418,10 @@ static int afs_fill_super(struct super_block *sb, if (!sb->s_root) goto error; - sb->s_d_op = &afs_fs_dentry_operations; + if (params->dyn_root) + sb->s_d_op = &afs_dynroot_dentry_operations; + else + sb->s_d_op = &afs_fs_dentry_operations; _leave(" = 0"); return 0; @@ -676,7 +679,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = 0; return 0; } - + key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) return PTR_ERR(key); diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 5d8562f1ad4a..1ed7e2fd2f35 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -303,7 +303,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net, r->uuid.clock_seq_hi_and_reserved = htonl(u->clock_seq_hi_and_reserved); r->uuid.clock_seq_low = htonl(u->clock_seq_low); for (i = 0; i < 6; i++) - r->uuid.node[i] = ntohl(u->node[i]); + r->uuid.node[i] = 
htonl(u->node[i]); trace_afs_make_vl_call(call); return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false); @@ -450,7 +450,7 @@ again: call->count2 = ntohl(*bp); /* Type or next count */ if (call->count > YFS_MAXENDPOINTS) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT); if (!alist) @@ -474,7 +474,7 @@ again: size = sizeof(__be32) * (1 + 4 + 1); break; default: - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); } size += sizeof(__be32); @@ -487,24 +487,24 @@ again: switch (call->count2) { case YFS_ENDPOINT_IPV4: if (ntohl(bp[0]) != sizeof(__be32) * 2) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2])); bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5])); bp += 6; break; default: - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); } /* Got either the type of the next entry or the count of * volEndpoints if no more fsEndpoints. 
*/ - call->count2 = htonl(*bp++); + call->count2 = ntohl(*bp++); call->offset = 0; call->count--; @@ -517,7 +517,7 @@ again: if (!call->count) goto end; if (call->count > YFS_MAXENDPOINTS) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->unmarshall = 3; @@ -531,7 +531,7 @@ again: return ret; bp = call->buffer; - call->count2 = htonl(*bp++); + call->count2 = ntohl(*bp++); call->offset = 0; call->unmarshall = 4; @@ -545,7 +545,7 @@ again: size = sizeof(__be32) * (1 + 4 + 1); break; default: - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); } if (call->count > 1) @@ -558,16 +558,16 @@ again: switch (call->count2) { case YFS_ENDPOINT_IPV4: if (ntohl(bp[0]) != sizeof(__be32) * 2) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); bp += 6; break; default: - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); } /* Got either the type of the next entry or the count of @@ -576,7 +576,7 @@ again: call->offset = 0; call->count--; if (call->count > 0) { - call->count2 = htonl(*bp++); + call->count2 = ntohl(*bp++); goto again; } diff --git a/fs/afs/volume.c b/fs/afs/volume.c index b517a588781f..3037bd01f617 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -225,7 +225,9 @@ void afs_activate_volume(struct afs_volume *volume) #ifdef CONFIG_AFS_FSCACHE volume->cache = fscache_acquire_cookie(volume->cell->cache, &afs_volume_cache_index_def, - volume, true); + &volume->vid, sizeof(volume->vid), + NULL, 0, + volume, 0, true); #endif write_lock(&volume->cell->proc_lock); @@ -245,7 +247,7 @@ void afs_deactivate_volume(struct afs_volume *volume) write_unlock(&volume->cell->proc_lock); #ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(volume->cache, + fscache_relinquish_cookie(volume->cache, NULL, test_bit(AFS_VOLUME_DELETED, &volume->flags)); volume->cache = NULL; #endif diff 
--git a/fs/afs/write.c b/fs/afs/write.c index 9370e2feb999..c164698dc304 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -42,10 +42,11 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, if (!req) return -ENOMEM; - atomic_set(&req->usage, 1); + refcount_set(&req->usage, 1); req->pos = pos; req->len = len; req->nr_pages = 1; + req->pages = req->array; req->pages[0] = page; get_page(page); @@ -124,7 +125,12 @@ try_again: page->index, priv); goto flush_conflicting_write; } - if (to < f || from > t) + /* If the file is being filled locally, allow inter-write + * spaces to be merged into writes. If it's not, only write + * back what the user gives us. + */ + if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) && + (to < f || from > t)) goto flush_conflicting_write; if (from < f) f = from; @@ -355,6 +361,12 @@ found_key: } switch (ret) { + case 0: + afs_stat_v(vnode, n_stores); + atomic_long_add((last * PAGE_SIZE + to) - + (first * PAGE_SIZE + offset), + &afs_v2net(vnode)->n_store_bytes); + break; case -EACCES: case -EPERM: case -ENOKEY: @@ -412,7 +424,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, trace_afs_page_dirty(vnode, tracepoint_string("WARN"), primary_page->index, priv); - if (start >= final_page || to < PAGE_SIZE) + if (start >= final_page || + (to < PAGE_SIZE && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags))) goto no_more; start++; @@ -433,9 +446,10 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, } for (loop = 0; loop < n; loop++) { - if (to != PAGE_SIZE) - break; page = pages[loop]; + if (to != PAGE_SIZE && + !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) + break; if (page->index > final_page) break; if (!trylock_page(page)) @@ -448,7 +462,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, priv = page_private(page); f = priv & AFS_PRIV_MAX; t = priv >> AFS_PRIV_SHIFT; - if (f != 0) { + if (f != 0 && + !test_bit(AFS_VNODE_NEW_CONTENT, 
&vnode->flags)) { unlock_page(page); break; } @@ -570,10 +585,11 @@ static int afs_writepages_region(struct address_space *mapping, _debug("wback %lx", page->index); - /* at this point we hold neither mapping->tree_lock nor lock on - * the page itself: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled back from - * swapper_space to tmpfs file mapping + /* + * at this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping */ ret = lock_page_killable(page); if (ret < 0) { @@ -734,20 +750,6 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) } /* - * Flush out all outstanding writes on a file opened for writing when it is - * closed. - */ -int afs_flush(struct file *file, fl_owner_t id) -{ - _enter(""); - - if ((file->f_mode & FMODE_WRITE) == 0) - return 0; - - return vfs_fsync(file, 0); -} - -/* * notification that a previously read-only page is about to become writable * - if it returns an error, the caller will deliver a bus error signal */ diff --git a/fs/afs/xdr_fs.h b/fs/afs/xdr_fs.h new file mode 100644 index 000000000000..aa21f3068d52 --- /dev/null +++ b/fs/afs/xdr_fs.h @@ -0,0 +1,103 @@ +/* AFS fileserver XDR types + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#ifndef XDR_FS_H +#define XDR_FS_H + +struct afs_xdr_AFSFetchStatus { + __be32 if_version; +#define AFS_FSTATUS_VERSION 1 + __be32 type; + __be32 nlink; + __be32 size_lo; + __be32 data_version_lo; + __be32 author; + __be32 owner; + __be32 caller_access; + __be32 anon_access; + __be32 mode; + __be32 parent_vnode; + __be32 parent_unique; + __be32 seg_size; + __be32 mtime_client; + __be32 mtime_server; + __be32 group; + __be32 sync_counter; + __be32 data_version_hi; + __be32 lock_count; + __be32 size_hi; + __be32 abort_code; +} __packed; + +#define AFS_DIR_HASHTBL_SIZE 128 +#define AFS_DIR_DIRENT_SIZE 32 +#define AFS_DIR_SLOTS_PER_BLOCK 64 +#define AFS_DIR_BLOCK_SIZE 2048 +#define AFS_DIR_BLOCKS_PER_PAGE (PAGE_SIZE / AFS_DIR_BLOCK_SIZE) +#define AFS_DIR_MAX_SLOTS 65536 +#define AFS_DIR_BLOCKS_WITH_CTR 128 +#define AFS_DIR_MAX_BLOCKS 1023 +#define AFS_DIR_RESV_BLOCKS 1 +#define AFS_DIR_RESV_BLOCKS0 13 + +/* + * Directory entry structure. + */ +union afs_xdr_dirent { + struct { + u8 valid; + u8 unused[1]; + __be16 hash_next; + __be32 vnode; + __be32 unique; + u8 name[16]; + u8 overflow[4]; /* if any char of the name (inc + * NUL) reaches here, consume + * the next dirent too */ + } u; + u8 extended_name[32]; +} __packed; + +/* + * Directory block header (one at the beginning of every 2048-byte block). + */ +struct afs_xdr_dir_hdr { + __be16 npages; + __be16 magic; +#define AFS_DIR_MAGIC htons(1234) + u8 reserved; + u8 bitmap[8]; + u8 pad[19]; +} __packed; + +/* + * Directory block layout + */ +union afs_xdr_dir_block { + struct afs_xdr_dir_hdr hdr; + + struct { + struct afs_xdr_dir_hdr hdr; + u8 alloc_ctrs[AFS_DIR_MAX_BLOCKS]; + __be16 hashtable[AFS_DIR_HASHTBL_SIZE]; + } meta; + + union afs_xdr_dirent dirents[AFS_DIR_SLOTS_PER_BLOCK]; +} __packed; + +/* + * Directory layout on a linux VM page. 
+ */ +struct afs_xdr_dir_page { + union afs_xdr_dir_block blocks[AFS_DIR_BLOCKS_PER_PAGE]; +}; + +#endif /* XDR_FS_H */ diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index a0c57c37fa21..be9c3dc048ab 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -19,9 +19,6 @@ */ static autofs_wqt_t autofs4_next_wait_queue = 1; -/* These are the signals we allow interrupting a pending mount */ -#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | sigmask(SIGQUIT)) - void autofs4_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; @@ -486,29 +483,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, * wq->name.name is NULL iff the lock is already released * or the mount has been made catatonic. */ - if (wq->name.name) { - /* Block all but "shutdown" signals while waiting */ - unsigned long shutdown_sigs_mask; - unsigned long irqflags; - sigset_t oldset; - - spin_lock_irqsave(¤t->sighand->siglock, irqflags); - oldset = current->blocked; - shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0]; - siginitsetinv(¤t->blocked, shutdown_sigs_mask); - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); - - wait_event_interruptible(wq->queue, wq->name.name == NULL); - - spin_lock_irqsave(¤t->sighand->siglock, irqflags); - current->blocked = oldset; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); - } else { - pr_debug("skipped sleeping\n"); - } - + wait_event_killable(wq->queue, wq->name.name == NULL); status = wq->status; /* @@ -574,7 +549,7 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok kfree(wq->name.name); wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; - wake_up_interruptible(&wq->queue); + wake_up(&wq->queue); if (!--wq->wait_ctr) kfree(wq); mutex_unlock(&sbi->wq_mutex); diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index ce1824f47ba6..c3deb2e35f20 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -330,6 
+330,7 @@ beyond_if: #ifdef __alpha__ regs->gp = ex.a_gpvalue; #endif + finalize_exec(bprm); start_thread(regs, ex.a_entry, current->mm->start_stack); return 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index bdb201230bae..41e04183e4ce 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -377,6 +377,11 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, } else map_addr = vm_mmap(filep, addr, size, prot, type, off); + if ((type & MAP_FIXED_NOREPLACE) && BAD_ADDR(map_addr)) + pr_info("%d (%s): Uhuuh, elf segment at %p requested but the memory is mapped already\n", + task_pid_nr(current), current->comm, + (void *)addr); + return(map_addr); } @@ -575,7 +580,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, elf_prot |= PROT_EXEC; vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) - elf_type |= MAP_FIXED; + elf_type |= MAP_FIXED_NOREPLACE; else if (no_base && interp_elf_ex->e_type == ET_DYN) load_addr = -vaddr; @@ -890,7 +895,7 @@ static int load_elf_binary(struct linux_binprm *bprm) the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { - int elf_prot = 0, elf_flags; + int elf_prot = 0, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE; unsigned long k, vaddr; unsigned long total_size = 0; @@ -922,6 +927,13 @@ static int load_elf_binary(struct linux_binprm *bprm) */ } } + + /* + * Some binaries have overlapping elf segments and then + * we have to forcefully map over an existing mapping + * e.g. over this newly established brk mapping. + */ + elf_fixed = MAP_FIXED; } if (elf_ppnt->p_flags & PF_R) @@ -939,7 +951,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * the ET_DYN load_addr calculations, proceed normally. 
*/ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { - elf_flags |= MAP_FIXED; + elf_flags |= elf_fixed; } else if (loc->elf_ex.e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program @@ -975,7 +987,7 @@ static int load_elf_binary(struct linux_binprm *bprm) load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - elf_flags |= MAP_FIXED; + elf_flags |= elf_fixed; } else load_bias = 0; @@ -1155,6 +1167,7 @@ static int load_elf_binary(struct linux_binprm *bprm) ELF_PLAT_INIT(regs, reloc_func_desc); #endif + finalize_exec(bprm); start_thread(regs, elf_entry, bprm->p); retval = 0; out: @@ -1234,7 +1247,7 @@ static int load_elf_library(struct file *file) (eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr)), PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, + MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_DENYWRITE, (eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr))); if (error != ELF_PAGESTART(eppnt->p_vaddr)) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 429326b6e2e7..d90993adeffa 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -463,6 +463,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) dynaddr); #endif + finalize_exec(bprm); /* everything is now ready... 
get the userspace context ready to roll */ entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; start_thread(regs, entryaddr, current->mm->start_stack); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 5d6b94475f27..82a48e830018 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -994,6 +994,7 @@ static int load_flat_binary(struct linux_binprm *bprm) FLAT_PLAT_INIT(regs); #endif + finalize_exec(bprm); pr_debug("start_thread(regs=0x%p, entry=0x%lx, start_stack=0x%lx)\n", regs, start_addr, current->mm->start_stack); start_thread(regs, start_addr, current->mm->start_stack); diff --git a/fs/block_dev.c b/fs/block_dev.c index fe09ef9c21f3..7ec920e27065 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1324,7 +1324,8 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty) * @bdev: struct bdev to adjust. * * This routine checks to see if the bdev size does not match the disk size - * and adjusts it if it differs. + * and adjusts it if it differs. When shrinking the bdev size, its all caches + * are freed. 
*/ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) { @@ -1337,7 +1338,8 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) "%s: detected capacity change from %lld to %lld\n", disk->disk_name, bdev_size, disk_size); i_size_write(bdev->bd_inode, disk_size); - flush_disk(bdev, false); + if (bdev_size > disk_size) + flush_disk(bdev, false); } } EXPORT_SYMBOL(check_disk_size_change); @@ -1946,11 +1948,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { - if (dax_mapping(mapping)) { - struct block_device *bdev = I_BDEV(mapping->host); - - return dax_writeback_mapping_range(mapping, bdev, wbc); - } return generic_writepages(mapping, wbc); } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 167e5dc7eadd..23537bc8c827 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + config BTRFS_FS tristate "Btrfs filesystem support" select LIBCRC32C diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0066d95b133f..15e1dfef56a5 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ #include <linux/fs.h> diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index d5540749f0e5..d522494698fa 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. * Copyright (C) 2014 Fujitsu. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/kthread.h> diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index fc957e00cef1..7861c9feba5f 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -1,24 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. * Copyright (C) 2014 Fujitsu. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_ASYNC_THREAD_ -#define __BTRFS_ASYNC_THREAD_ +#ifndef BTRFS_ASYNC_THREAD_H +#define BTRFS_ASYNC_THREAD_H + #include <linux/workqueue.h> struct btrfs_fs_info; @@ -85,4 +73,5 @@ void btrfs_set_work_high_priority(struct btrfs_work *work); struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work); struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); + #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 571024bc632e..0a8e2e29a66b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/mm.h> diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 0a30028d5196..54d58988483a 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2011 STRATO. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_BACKREF__ -#define __BTRFS_BACKREF__ +#ifndef BTRFS_BACKREF_H +#define BTRFS_BACKREF_H #include <linux/btrfs.h> #include "ulist.h" diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ca15be569d69..234bae55b85d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ -#ifndef __BTRFS_I__ -#define __BTRFS_I__ +#ifndef BTRFS_INODE_H +#define BTRFS_INODE_H #include <linux/hash.h> #include "extent_map.h" diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 3baebbc021c5..dc062b195c46 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) STRATO AG 2011. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ /* diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index 2de58a99ee92..9bf4359cc44c 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) STRATO AG 2011. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#if !defined(__BTRFS_CHECK_INTEGRITY__) -#define __BTRFS_CHECK_INTEGRITY__ +#ifndef BTRFS_CHECK_INTEGRITY_H +#define BTRFS_CHECK_INTEGRITY_H #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 562c3e633403..1061575a7d25 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/kernel.h> @@ -458,7 +445,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, break; rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, pg_index); + page = radix_tree_lookup(&mapping->i_pages, pg_index); rcu_read_unlock(); if (page && !radix_tree_exceptional_entry(page)) { misses++; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index ce796557a918..cc605f7b23fb 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2008 Oracle. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_COMPRESSION_ -#define __BTRFS_COMPRESSION_ +#ifndef BTRFS_COMPRESSION_H +#define BTRFS_COMPRESSION_H /* * We want to make sure that amount of RAM required to uncompress an extent is diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a2c9d21176e2..3fd44835b386 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007,2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ #include <linux/sched.h> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0eb55825862a..5474ef14d6e6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_CTREE__ -#define __BTRFS_CTREE__ +#ifndef BTRFS_CTREE_H +#define BTRFS_CTREE_H #include <linux/mm.h> #include <linux/sched/signal.h> @@ -3752,4 +3739,5 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) #endif return 0; } + #endif diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h index 83ebfe28da9e..90281a7a35a8 100644 --- a/fs/btrfs/dedupe.h +++ b/fs/btrfs/dedupe.h @@ -1,24 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2016 Fujitsu. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_DEDUPE__ -#define __BTRFS_DEDUPE__ +#ifndef BTRFS_DEDUPE_H +#define BTRFS_DEDUPE_H /* later in-band dedupe will expand this struct */ struct btrfs_dedupe_hash; + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 86ec2edc05e8..06ec8ab6d9ba 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 Fujitsu. All rights reserved. * Written by Miao Xie <miaox@cn.fujitsu.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/slab.h> diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 100a91e26b55..ca7a97f3ab6b 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -1,24 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2011 Fujitsu. All rights reserved. * Written by Miao Xie <miaox@cn.fujitsu.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __DELAYED_TREE_OPERATION_H -#define __DELAYED_TREE_OPERATION_H +#ifndef BTRFS_DELAYED_INODE_H +#define BTRFS_DELAYED_INODE_H #include <linux/rbtree.h> #include <linux/spinlock.h> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 2677257c149d..9e98295de7ce 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 9e3e5aff0937..741869dbc316 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2008 Oracle. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __DELAYED_REF__ -#define __DELAYED_REF__ + +#ifndef BTRFS_DELAYED_REF_H +#define BTRFS_DELAYED_REF_H #include <linux/refcount.h> @@ -298,4 +286,5 @@ btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) { return container_of(node, struct btrfs_delayed_data_ref, node); } + #endif diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 0d203633bb96..f82be266ba4b 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) STRATO AG 2012. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ + #include <linux/sched.h> #include <linux/bio.h> #include <linux/slab.h> diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 8566a02ef222..b6d4206188bb 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) STRATO AG 2012. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#if !defined(__BTRFS_DEV_REPLACE__) -#define __BTRFS_DEV_REPLACE__ +#ifndef BTRFS_DEV_REPLACE_H +#define BTRFS_DEV_REPLACE_H struct btrfs_ioctl_dev_replace_args; @@ -48,4 +35,5 @@ static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) { atomic64_inc(stat_value); } + #endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 29e967b2c667..39e9766d1cbd 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include "ctree.h" diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b5e6f7df67..4ac8b1d21baf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -449,6 +436,14 @@ static int verify_level_key(struct btrfs_fs_info *fs_info, if (!first_key) return 0; + /* + * For live tree block (new tree blocks in current transaction), + * we need proper lock context to avoid race, which is impossible here. + * So we only checks tree blocks which is read from disk, whose + * generation <= fs_info->last_trans_committed. 
+ */ + if (btrfs_header_generation(eb) > fs_info->last_trans_committed) + return 0; if (found_level) btrfs_node_key_to_cpu(eb, &found_key, 0); else @@ -3812,7 +3807,8 @@ void close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) || + test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) btrfs_error_commit_super(fs_info); kthread_stop(fs_info->transaction_kthread); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 453ea9f5d4e9..1a3d277b027b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ -#ifndef __DISKIO__ -#define __DISKIO__ +#ifndef BTRFS_DISK_IO_H +#define BTRFS_DISK_IO_H #define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 @@ -169,4 +156,5 @@ static inline void btrfs_set_buffer_lockdep_class(u64 objectid, { } #endif + #endif diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index ddaccad469f8..1f3755b3a37a 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 + #include <linux/fs.h> #include <linux/types.h> #include "ctree.h" diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 91b3908e7c54..57488ecd7d4e 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ + #ifndef BTRFS_EXPORT_H #define BTRFS_EXPORT_H diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e08d0d45af4f..75cfb80d2551 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ + #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/pagemap.h> @@ -4642,6 +4630,7 @@ again: if (wait_for_alloc) { mutex_unlock(&fs_info->chunk_mutex); wait_for_alloc = 0; + cond_resched(); goto again; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 47a8fe9d22e8..e99b329002cf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 + #include <linux/bitops.h> #include <linux/slab.h> #include <linux/bio.h> @@ -3963,11 +3964,11 @@ retry: done_index = page->index; /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point we hold neither the i_pages lock nor + * the page lock: the page may be truncated or + * invalidated (changing page->mapping to NULL), + * or even swizzled back from swapper_space to + * tmpfs file mapping */ if (!trylock_page(page)) { flush_write_bio(epd); @@ -5174,13 +5175,13 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(!PagePrivate(page)); clear_page_dirty_for_io(page); - spin_lock_irq(&page->mapping->tree_lock); + xa_lock_irq(&page->mapping->i_pages); if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->page_tree, + radix_tree_tag_clear(&page->mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&page->mapping->tree_lock); + xa_unlock_irq(&page->mapping->i_pages); ClearPageError(page); unlock_page(page); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b77d84909863..a53009694b16 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __EXTENTIO__ -#define __EXTENTIO__ + +#ifndef BTRFS_EXTENT_IO_H +#define BTRFS_EXTENT_IO_H #include <linux/rbtree.h> #include <linux/refcount.h> @@ -572,4 +573,5 @@ noinline u64 
find_lock_delalloc_range(struct inode *inode, #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); + #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 53a0633c6ef7..1b8a078f92eb 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 + #include <linux/err.h> #include <linux/slab.h> #include <linux/spinlock.h> diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index f6f8ba114977..5fcb80a6ce37 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __EXTENTMAP__ -#define __EXTENTMAP__ + +#ifndef BTRFS_EXTENT_MAP_H +#define BTRFS_EXTENT_MAP_H #include <linux/rbtree.h> #include <linux/refcount.h> @@ -93,4 +94,5 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, struct extent_map **em_in, u64 start, u64 len); + #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index fdcb41002623..f9dd6d1836a3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ #include <linux/bio.h> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f247300170e5..0167a9c97c9c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d0dde9e6afd7..e5b569bebc73 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ #include <linux/pagemap.h> diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 79eca4cabb1c..15e30b93db0d 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_FREE_SPACE_CACHE -#define __BTRFS_FREE_SPACE_CACHE +#ifndef BTRFS_FREE_SPACE_CACHE_H +#define BTRFS_FREE_SPACE_CACHE_H struct btrfs_free_space { struct rb_node offset_index; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index af36a6a971fe..32a0f6cb5594 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015 Facebook. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/kernel.h> diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index ba3787df43c3..874b4feecad2 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2015 Facebook. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_FREE_SPACE_TREE -#define __BTRFS_FREE_SPACE_TREE +#ifndef BTRFS_FREE_SPACE_TREE_H +#define BTRFS_FREE_SPACE_TREE_H /* * The default size for new free space bitmap items. The last bitmap in a block diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 1d5631ef2738..a8956a3c9e05 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include "ctree.h" diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 9409dcc7020d..12fcd8897c33 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ #include <linux/delay.h> diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h index 6734ec92a1e9..7a962811dffe 100644 --- a/fs/btrfs/inode-map.h +++ b/fs/btrfs/inode-map.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __BTRFS_INODE_MAP -#define __BTRFS_INODE_MAP + +#ifndef BTRFS_INODE_MAP_H +#define BTRFS_INODE_MAP_H void btrfs_init_free_ino_ctl(struct btrfs_root *root); void btrfs_unpin_free_ino(struct btrfs_root *root); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f091c2358a4..e064c49c9a9a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/kernel.h> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b2db3988813f..632e26d6f7ce 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/kernel.h> diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 621083f8932c..e4faefac9d16 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ + #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/spinlock.h> diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index c44a9d5f5362..29135def468e 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_LOCKING_ -#define __BTRFS_LOCKING_ +#ifndef BTRFS_LOCKING_H +#define BTRFS_LOCKING_H #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 1c7f7f70caf4..0667ea07f766 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/kernel.h> diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h index 1b10a3cd1195..75246f2f56ba 100644 --- a/fs/btrfs/math.h +++ b/fs/btrfs/math.h @@ -1,25 +1,11 @@ - +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2012 Fujitsu. All rights reserved. 
* Written by Miao Xie <miaox@cn.fujitsu.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_MATH_H -#define __BTRFS_MATH_H +#ifndef BTRFS_MATH_H +#define BTRFS_MATH_H #include <asm/div64.h> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 661cc3db0c7c..6db8bb2f2c28 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ #include <linux/slab.h> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4a1672a13ba6..3be443fb3001 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_ORDERED_DATA__ -#define __BTRFS_ORDERED_DATA__ +#ifndef BTRFS_ORDERED_DATA_H +#define BTRFS_ORDERED_DATA_H /* one of these per inode */ struct btrfs_ordered_inode_tree { @@ -218,4 +205,5 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); void __cold ordered_data_exit(void); + #endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 47767d5b8f0b..aa534108c1e2 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include "ctree.h" diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 4a8770485f77..124276bba8cf 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include "ctree.h" diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 3afd508ed8c5..4a98481688f4 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -1,23 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __PRINT_TREE_ -#define __PRINT_TREE_ +#ifndef BTRFS_PRINT_TREE_H +#define BTRFS_PRINT_TREE_H + void btrfs_print_leaf(struct extent_buffer *l); void btrfs_print_tree(struct extent_buffer *c); + #endif diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 5859f7d3cf3e..53a8c95828e3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ #include <linux/hashtable.h> diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 100f18829d50..618815b4f9d5 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_PROPS_H -#define __BTRFS_PROPS_H +#ifndef BTRFS_PROPS_H +#define BTRFS_PROPS_H #include "ctree.h" diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f583f13ff26e..09c7e4fd550f 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ #include <linux/sched.h> diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index e63e2d497a8e..d60dd06445ce 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2014 Facebook. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_QGROUP__ -#define __BTRFS_QGROUP__ +#ifndef BTRFS_QGROUP_H +#define BTRFS_QGROUP_H #include "ulist.h" #include "delayed-ref.h" @@ -341,4 +328,5 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); void btrfs_qgroup_check_reserved_leak(struct inode *inode); -#endif /* __BTRFS_QGROUP__ */ + +#endif diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c3a2bc8af675..9abd950e7f78 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2012 Fusion-io All rights reserved. * Copyright (C) 2012 Intel Corp. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ + #include <linux/sched.h> #include <linux/wait.h> #include <linux/bio.h> diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 4ee4fe346838..f5d4c13a8dbc 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -1,24 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2012 Fusion-io All rights reserved. * Copyright (C) 2012 Intel Corp. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ -#ifndef __BTRFS_RAID56__ -#define __BTRFS_RAID56__ +#ifndef BTRFS_RAID56_H +#define BTRFS_RAID56_H + static inline int nr_parity_stripes(struct map_lookup *map) { if (map->type & BTRFS_BLOCK_GROUP_RAID5) @@ -65,4 +53,5 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); + #endif diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index 9e111e4576d4..a97dc74a4d3d 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -1,21 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2012 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ +#ifndef BTRFS_RCU_STRING_H +#define BTRFS_RCU_STRING_H + struct rcu_string { struct rcu_head rcu; char str[0]; @@ -54,3 +44,5 @@ static inline void rcu_string_free(struct rcu_string *str) struct rcu_string *__str = rcu_dereference(rcu_str); \ __str->str; \ }) + +#endif diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index a52dd12af648..40f1bcef394d 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 STRATO. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 35fab67dcbe8..e5b9e596bb92 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2014 Facebook. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 3bf02ce0e1e2..b7d2a4edfdb7 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2014 Facebook. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __REF_VERIFY__ -#define __REF_VERIFY__ + +#ifndef BTRFS_REF_VERIFY_H +#define BTRFS_REF_VERIFY_H #ifdef CONFIG_BTRFS_FS_REF_VERIFY int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); @@ -59,4 +47,5 @@ static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) } #endif /* CONFIG_BTRFS_FS_REF_VERIFY */ -#endif /* _REF_VERIFY__ */ + +#endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4874c09f6d3c..00b7d3231821 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ #include <linux/sched.h> diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index aab0194efe46..6db3bda44aa5 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/err.h> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 1a2066ac6fe7..52b39a0924e9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011, 2012 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ #include <linux/blkdev.h> diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1f5748c7d1c7..221e5cdb060b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2012 Alexander Block. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/bsearch.h> diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 3aa4bc55754f..ead397f7034f 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -1,22 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2012 Alexander Block. All rights reserved. * Copyright (C) 2012 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ +#ifndef BTRFS_SEND_H +#define BTRFS_SEND_H + #include "ctree.h" #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" @@ -132,3 +122,5 @@ enum { #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg); #endif + +#endif diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 5e2b92d83617..b7b4acb12833 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/highmem.h> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 170baef49fae..0628092b0b1b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/blkdev.h> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index ca067471cd46..4848a4318fb5 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 80457f31c29f..b567560d9aa9 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BTRFS_SYSFS_H_ -#define _BTRFS_SYSFS_H_ + +#ifndef BTRFS_SYSFS_H +#define BTRFS_SYSFS_H /* * Data exported through sysfs @@ -90,4 +91,4 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, u64 bit, enum btrfs_feature_set set); -#endif /* _BTRFS_SYSFS_H_ */ +#endif diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index e74278170806..30ed438da2a9 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Fusion IO. 
All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index bc0615bac3cc..a5a0b9500d3e 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2013 Fusion IO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ -#ifndef __BTRFS_TESTS -#define __BTRFS_TESTS +#ifndef BTRFS_TESTS_H +#define BTRFS_TESTS_H #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_run_sanity_tests(void); diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index b9142c614114..31e8a9ec228c 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Fusion IO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/slab.h> diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 2e7f64a3b22b..76aa5a678a96 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Fusion IO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/pagemap.h> diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index c23bd00bdd92..79e0a5f4d9c9 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/types.h> diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index eca6412d42bd..d3c9f8a59ba5 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Fusion IO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/slab.h> diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 8444a018cca2..e1f9666c4974 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015 Facebook. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/types.h> diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 13420cd19ef0..e0ba799536b4 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Fusion IO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/types.h> diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 160eb2fba726..39b95783f736 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Facebook. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/types.h> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5c4cf0f9146b..63fdcab64b01 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index b6c94ce33503..c88fccd80bc5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ -#ifndef __BTRFS_TRANSACTION__ -#define __BTRFS_TRANSACTION__ +#ifndef BTRFS_TRANSACTION_H +#define BTRFS_TRANSACTION_H #include <linux/refcount.h> #include "btrfs_inode.h" @@ -228,4 +215,5 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); + #endif diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 8871286c1a91..8d40e7dd8c30 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1,17 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) Qu Wenruo 2017. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program. */ /* diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index aba542755710..ff043275b784 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -1,21 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) Qu Wenruo 2017. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program. */ -#ifndef __BTRFS_TREE_CHECKER__ -#define __BTRFS_TREE_CHECKER__ +#ifndef BTRFS_TREE_CHECKER_H +#define BTRFS_TREE_CHECKER_H #include "ctree.h" #include "extent_io.h" diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index c09dbe4bd6e7..3c0987ab587d 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c91babc6aa4b..43758e30aa7a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> @@ -2352,8 +2339,10 @@ again: nritems = btrfs_header_nritems(path->nodes[0]); if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); - if (ret) + if (ret == 1) break; + else if (ret < 0) + goto out; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -2456,13 +2445,41 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (ret) break; - /* for regular files, make sure corresponding - * orphan item exist. extents past the new EOF - * will be truncated later by orphan cleanup. + /* + * Before replaying extents, truncate the inode to its + * size. We need to do it now and not after log replay + * because before an fsync we can have prealloc extents + * added beyond the inode's i_size. If we did it after, + * through orphan cleanup for example, we would drop + * those prealloc extents just after replaying them. */ if (S_ISREG(mode)) { - ret = insert_orphan_item(wc->trans, root, - key.objectid); + struct inode *inode; + u64 from; + + inode = read_one_inode(root, key.objectid); + if (!inode) { + ret = -EIO; + break; + } + from = ALIGN(i_size_read(inode), + root->fs_info->sectorsize); + ret = btrfs_drop_extents(wc->trans, root, inode, + from, (u64)-1, 1); + /* + * If the nlink count is zero here, the iput + * will free the inode. We bump it to make + * sure it doesn't get freed until the link + * count fixup is done. + */ + if (!ret) { + if (inode->i_nlink == 0) + inc_nlink(inode); + /* Update link count and nbytes. 
*/ + ret = btrfs_update_inode(wc->trans, + root, inode); + } + iput(inode); if (ret) break; } @@ -3520,8 +3537,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * from this directory and from this transaction */ ret = btrfs_next_leaf(root, path); - if (ret == 1) { - last_offset = (u64)-1; + if (ret) { + if (ret == 1) + last_offset = (u64)-1; + else + err = ret; goto done; } btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); @@ -4354,6 +4374,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, num++; } + /* + * Add all prealloc extents beyond the inode's i_size to make sure we + * don't lose them after doing a fast fsync and replaying the log. + */ + if (inode->flags & BTRFS_INODE_PREALLOC) { + struct rb_node *node; + + for (node = rb_last(&tree->map); node; node = rb_prev(node)) { + em = rb_entry(node, struct extent_map, rb_node); + if (em->start < i_size_read(&inode->vfs_inode)) + break; + if (!list_empty(&em->list)) + continue; + /* Same as above loop. */ + if (++num > 32768) { + list_del_init(&tree->modified_extents); + ret = -EFBIG; + goto process; + } + refcount_inc(&em->refs); + set_bit(EXTENT_FLAG_LOGGING, &em->flags); + list_add_tail(&em->list, &extents); + } + } + list_sort(NULL, &extents, extent_cmp); btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); /* diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 88abc43312a1..122e68b89a5a 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __TREE_LOG_ -#define __TREE_LOG_ +#ifndef BTRFS_TREE_LOG_H +#define BTRFS_TREE_LOG_H #include "ctree.h" #include "transaction.h" @@ -87,4 +74,5 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, int btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, struct dentry *parent); + #endif diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index d8edf164f81c..3374c9e9be67 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 STRATO AG * written by Arne Jansen <sensille@gmx.net> - * Distributed under the GNU GPL license version 2. */ #include <linux/slab.h> diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 53c913632733..02fda0a2d4ce 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -1,12 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2011 STRATO AG * written by Arne Jansen <sensille@gmx.net> - * Distributed under the GNU GPL license version 2. - * */ -#ifndef __ULIST__ -#define __ULIST__ +#ifndef BTRFS_ULIST_H +#define BTRFS_ULIST_H #include <linux/list.h> #include <linux/rbtree.h> diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 9916f03430bc..1ba7ca2a4200 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) STRATO AG 2013. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ + #include <linux/uuid.h> #include <asm/unaligned.h> #include "ctree.h" diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 93f8f17cacca..292266f6ab9c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ + #include <linux/sched.h> #include <linux/bio.h> #include <linux/slab.h> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d1fcaea9fef5..79096884654f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_VOLUMES_ -#define __BTRFS_VOLUMES_ +#ifndef BTRFS_VOLUMES_H +#define BTRFS_VOLUMES_H #include <linux/bio.h> #include <linux/sort.h> diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index e1e8177deb5e..ea78c3d6dcfc 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
*/ #include <linux/init.h> @@ -32,7 +19,6 @@ #include "props.h" #include "locking.h" - int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) { diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index e215a3212a2a..471fcac6ff55 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __XATTR__ -#define __XATTR__ +#ifndef BTRFS_XATTR_H +#define BTRFS_XATTR_H #include <linux/xattr.h> @@ -34,4 +21,4 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr); -#endif /* __XATTR__ */ +#endif diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 2b52950dc2c6..970ff3e35bb3 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * * Based on jffs2 zlib code: * Copyright © 2001-2007 Red Hat, Inc. * Created by David Woodhouse <dwmw2@infradead.org> diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 01a4eab602a3..af6ec59972f5 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -1,16 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016-present, Facebook, Inc. * All rights reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ + #include <linux/bio.h> #include <linux/err.h> #include <linux/init.h> diff --git a/fs/buffer.c b/fs/buffer.c index 9a73924db22f..249b83fafe48 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -185,10 +185,9 @@ EXPORT_SYMBOL(end_buffer_write_sync); * we get exclusion from try_to_free_buffers with the blockdev mapping's * private_lock. * - * Hack idea: for the blockdev mapping, i_bufferlist_lock contention + * Hack idea: for the blockdev mapping, private_lock contention * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take private_lock. (But if - * private_lock is contended then so is mapping->tree_lock). + * succeeds, there is no need to take private_lock. 
*/ static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block) @@ -495,35 +494,12 @@ repeat: return err; } -static void do_thaw_one(struct super_block *sb, void *unused) +void emergency_thaw_bdev(struct super_block *sb) { while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); } -static void do_thaw_all(struct work_struct *work) -{ - iterate_supers(do_thaw_one, NULL); - kfree(work); - printk(KERN_WARNING "Emergency Thaw complete\n"); -} - -/** - * emergency_thaw_all -- forcibly thaw every frozen filesystem - * - * Used for emergency unfreeze of all filesystems via SysRq - */ -void emergency_thaw_all(void) -{ - struct work_struct *work; - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) { - INIT_WORK(work, do_thaw_all); - schedule_work(work); - } -} - /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written @@ -594,20 +570,21 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * * The caller must hold lock_page_memcg(). */ -static void __set_page_dirty(struct page *page, struct address_space *mapping, +void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } +EXPORT_SYMBOL_GPL(__set_page_dirty); /* * Add a page to the dirty page list. @@ -1095,7 +1072,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * inode list. * * mark_buffer_dirty() is atomic. 
It takes bh->b_page->mapping->private_lock, - * mapping->tree_lock and mapping->host->i_lock. + * i_pages lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) { @@ -1511,7 +1488,7 @@ void block_invalidatepage(struct page *page, unsigned int offset, * The get_block cached value has been unconditionally invalidated, * so real IO is not possible anymore. */ - if (offset == 0) + if (length == PAGE_SIZE) try_to_release_page(page, 0); out: return; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index e7f16a77a22a..222bc5d8b62c 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -32,7 +32,7 @@ static struct fscache_object *cachefiles_alloc_object( struct cachefiles_cache *cache; struct cachefiles_xattr *auxdata; unsigned keylen, auxlen; - void *buffer; + void *buffer, *p; char *key; cache = container_of(_cache, struct cachefiles_cache, cache); @@ -65,8 +65,12 @@ static struct fscache_object *cachefiles_alloc_object( if (!buffer) goto nomem_buffer; - keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512); - ASSERTCMP(keylen, <, 512); + keylen = cookie->key_len; + if (keylen <= sizeof(cookie->inline_key)) + p = cookie->inline_key; + else + p = cookie->key; + memcpy(buffer + 2, p, keylen); *(uint16_t *)buffer = keylen; ((char *)buffer)[keylen + 2] = 0; @@ -80,15 +84,17 @@ static struct fscache_object *cachefiles_alloc_object( /* get hold of the auxiliary data and prepend the object type */ auxdata = buffer; - auxlen = 0; - if (cookie->def->get_aux) { - auxlen = cookie->def->get_aux(cookie->netfs_data, - auxdata->data, 511); - ASSERTCMP(auxlen, <, 511); + auxlen = cookie->aux_len; + if (auxlen) { + if (auxlen <= sizeof(cookie->inline_aux)) + p = cookie->inline_aux; + else + p = cookie->aux; + memcpy(auxdata->data, p, auxlen); } auxdata->len = auxlen + 1; - auxdata->type = cookie->def->type; + auxdata->type = cookie->type; lookup_data->auxdata = auxdata; lookup_data->key = key; @@ -177,10 +183,12 
@@ static void cachefiles_lookup_complete(struct fscache_object *_object) * increment the usage count on an inode object (may fail if unmounting) */ static -struct fscache_object *cachefiles_grab_object(struct fscache_object *_object) +struct fscache_object *cachefiles_grab_object(struct fscache_object *_object, + enum fscache_obj_ref_trace why) { struct cachefiles_object *object = container_of(_object, struct cachefiles_object, fscache); + int u; _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage)); @@ -188,7 +196,9 @@ struct fscache_object *cachefiles_grab_object(struct fscache_object *_object) ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); #endif - atomic_inc(&object->usage); + u = atomic_inc_return(&object->usage); + trace_cachefiles_ref(object, _object->cookie, + (enum cachefiles_obj_ref_trace)why, u); return &object->fscache; } @@ -202,6 +212,7 @@ static void cachefiles_update_object(struct fscache_object *_object) struct cachefiles_cache *cache; struct fscache_cookie *cookie; const struct cred *saved_cred; + const void *aux; unsigned auxlen; _enter("{OBJ%x}", _object->debug_id); @@ -216,26 +227,29 @@ static void cachefiles_update_object(struct fscache_object *_object) } cookie = object->fscache.cookie; + auxlen = cookie->aux_len; - if (!cookie->def->get_aux) { + if (!auxlen) { fscache_unuse_cookie(_object); _leave(" [no aux]"); return; } - auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp); + auxdata = kmalloc(2 + auxlen + 3, cachefiles_gfp); if (!auxdata) { fscache_unuse_cookie(_object); _leave(" [nomem]"); return; } - auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); + aux = (auxlen <= sizeof(cookie->inline_aux)) ? 
+ cookie->inline_aux : cookie->aux; + + memcpy(auxdata->data, aux, auxlen); fscache_unuse_cookie(_object); - ASSERTCMP(auxlen, <, 511); auxdata->len = auxlen + 1; - auxdata->type = cookie->def->type; + auxdata->type = cookie->type; cachefiles_begin_secure(cache, &saved_cred); cachefiles_update_object_xattr(object, auxdata); @@ -309,10 +323,12 @@ static void cachefiles_drop_object(struct fscache_object *_object) /* * dispose of a reference to an object */ -static void cachefiles_put_object(struct fscache_object *_object) +static void cachefiles_put_object(struct fscache_object *_object, + enum fscache_obj_ref_trace why) { struct cachefiles_object *object; struct fscache_cache *cache; + int u; ASSERT(_object); @@ -328,7 +344,11 @@ static void cachefiles_put_object(struct fscache_object *_object) ASSERTIFCMP(object->fscache.parent, object->fscache.parent->n_children, >, 0); - if (atomic_dec_and_test(&object->usage)) { + u = atomic_dec_return(&object->usage); + trace_cachefiles_ref(object, _object->cookie, + (enum cachefiles_obj_ref_trace)why, u); + ASSERTCMP(u, !=, -1); + if (u == 0) { _debug("- kill object OBJ%x", object->fscache.debug_id); ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)); @@ -421,7 +441,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) loff_t oi_size; int ret; - _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size); + ni_size = _object->store_limit_l; _enter("{OBJ%x},[%llu]", _object->debug_id, (unsigned long long) ni_size); @@ -493,8 +513,7 @@ static void cachefiles_invalidate_object(struct fscache_operation *op) cache = container_of(object->fscache.cache, struct cachefiles_cache, cache); - op->object->cookie->def->get_attr(op->object->cookie->netfs_data, - &ni_size); + ni_size = op->object->store_limit_l; _enter("{OBJ%x},[%llu]", op->object->debug_id, (unsigned long long)ni_size); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index bb3a02ca9da4..d2f6f996e65a 100644 --- 
a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -124,6 +124,8 @@ struct cachefiles_xattr { uint8_t data[]; }; +#include <trace/events/cachefiles.h> + /* * note change of state for daemon */ diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c index 711f13d8c2de..f54d3f5b2e40 100644 --- a/fs/cachefiles/main.c +++ b/fs/cachefiles/main.c @@ -22,6 +22,7 @@ #include <linux/statfs.h> #include <linux/sysctl.h> #include <linux/miscdevice.h> +#define CREATE_TRACE_POINTS #include "internal.h" unsigned cachefiles_debug; diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 3978b324cbca..0daa1e3fe0df 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -30,11 +30,11 @@ */ static noinline void __cachefiles_printk_object(struct cachefiles_object *object, - const char *prefix, - u8 *keybuf) + const char *prefix) { struct fscache_cookie *cookie; - unsigned keylen, loop; + const u8 *k; + unsigned loop; pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id); pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", @@ -56,23 +56,16 @@ void __cachefiles_printk_object(struct cachefiles_object *object, object->fscache.cookie->parent, object->fscache.cookie->netfs_data, object->fscache.cookie->flags); - if (keybuf && cookie->def) - keylen = cookie->def->get_key(cookie->netfs_data, keybuf, - CACHEFILES_KEYBUF_SIZE); - else - keylen = 0; + pr_err("%skey=[%u] '", prefix, cookie->key_len); + k = (cookie->key_len <= sizeof(cookie->inline_key)) ? 
+ cookie->inline_key : cookie->key; + for (loop = 0; loop < cookie->key_len; loop++) + pr_cont("%02x", k[loop]); + pr_cont("'\n"); } else { pr_err("%scookie=NULL\n", prefix); - keylen = 0; } spin_unlock(&object->fscache.lock); - - if (keylen) { - pr_err("%skey=[%u] '", prefix, keylen); - for (loop = 0; loop < keylen; loop++) - pr_cont("%02x", keybuf[loop]); - pr_cont("'\n"); - } } /* @@ -81,14 +74,10 @@ void __cachefiles_printk_object(struct cachefiles_object *object, static noinline void cachefiles_printk_object(struct cachefiles_object *object, struct cachefiles_object *xobject) { - u8 *keybuf; - - keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO); if (object) - __cachefiles_printk_object(object, "", keybuf); + __cachefiles_printk_object(object, ""); if (xobject) - __cachefiles_printk_object(xobject, "x", keybuf); - kfree(keybuf); + __cachefiles_printk_object(xobject, "x"); } /* @@ -120,6 +109,7 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, } write_unlock(&cache->active_lock); + trace_cachefiles_mark_buried(NULL, dentry, why); _leave(" [no owner]"); return; @@ -130,6 +120,8 @@ found_dentry: object->fscache.state->name, dentry); + trace_cachefiles_mark_buried(object, dentry, why); + if (fscache_object_is_live(&object->fscache)) { pr_err("\n"); pr_err("Error: Can't preemptively bury live object\n"); @@ -158,13 +150,15 @@ static int cachefiles_mark_object_active(struct cachefiles_cache *cache, try_again: write_lock(&cache->active_lock); + dentry = object->dentry; + trace_cachefiles_mark_active(object, dentry); + if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { pr_err("Error: Object already active\n"); cachefiles_printk_object(object, NULL); BUG(); } - dentry = object->dentry; _p = &cache->active_nodes.rb_node; while (*_p) { _parent = *_p; @@ -191,6 +185,8 @@ try_again: /* an old object from a previous incarnation is hogging the slot - we * need to wait for it to be destroyed */ wait_for_old_object: + 
trace_cachefiles_wait_active(object, dentry, xobject); + if (fscache_object_is_live(&xobject->fscache)) { pr_err("\n"); pr_err("Error: Unexpected object collision\n"); @@ -248,12 +244,12 @@ wait_for_old_object: ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)); - cache->cache.ops->put_object(&xobject->fscache); + cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_retry); goto try_again; requeue: clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); - cache->cache.ops->put_object(&xobject->fscache); + cache->cache.ops->put_object(&xobject->fscache, cachefiles_obj_put_wait_timeo); _leave(" = -ETIMEDOUT"); return -ETIMEDOUT; } @@ -265,6 +261,11 @@ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, struct cachefiles_object *object, blkcnt_t i_blocks) { + struct dentry *dentry = object->dentry; + struct inode *inode = d_backing_inode(dentry); + + trace_cachefiles_mark_inactive(object, dentry, inode); + write_lock(&cache->active_lock); rb_erase(&object->active_node, &cache->active_nodes); clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); @@ -288,6 +289,7 @@ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, * - unlocks the directory mutex */ static int cachefiles_bury_object(struct cachefiles_cache *cache, + struct cachefiles_object *object, struct dentry *dir, struct dentry *rep, bool preemptive, @@ -312,6 +314,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, if (ret < 0) { cachefiles_io_error(cache, "Unlink security error"); } else { + trace_cachefiles_unlink(object, rep, why); ret = vfs_unlink(d_inode(dir), rep, NULL); if (preemptive) @@ -413,6 +416,7 @@ try_again: if (ret < 0) { cachefiles_io_error(cache, "Rename security error %d", ret); } else { + trace_cachefiles_rename(object, rep, grave, why); ret = vfs_rename(d_inode(dir), rep, d_inode(cache->graveyard), grave, NULL, 0); if (ret != 0 && ret != -ENOMEM) @@ -458,7 +462,7 @@ int cachefiles_delete_object(struct 
cachefiles_cache *cache, /* we need to check that our parent is _still_ our parent - it * may have been renamed */ if (dir == object->dentry->d_parent) { - ret = cachefiles_bury_object(cache, dir, + ret = cachefiles_bury_object(cache, object, dir, object->dentry, false, FSCACHE_OBJECT_WAS_RETIRED); } else { @@ -486,6 +490,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, { struct cachefiles_cache *cache; struct dentry *dir, *next = NULL; + struct inode *inode; struct path path; unsigned long start; const char *name; @@ -529,13 +534,17 @@ lookup_again: start = jiffies; next = lookup_one_len(name, dir, nlen); cachefiles_hist(cachefiles_lookup_histogram, start); - if (IS_ERR(next)) + if (IS_ERR(next)) { + trace_cachefiles_lookup(object, next, NULL); goto lookup_error; + } - _debug("next -> %p %s", next, d_backing_inode(next) ? "positive" : "negative"); + inode = d_backing_inode(next); + trace_cachefiles_lookup(object, next, inode); + _debug("next -> %p %s", next, inode ? 
"positive" : "negative"); if (!key) - object->new = !d_backing_inode(next); + object->new = !inode; /* if this element of the path doesn't exist, then the lookup phase * failed, and we can release any readers in the certain knowledge that @@ -558,6 +567,8 @@ lookup_again: start = jiffies; ret = vfs_mkdir(d_inode(dir), next, 0); cachefiles_hist(cachefiles_mkdir_histogram, start); + if (!key) + trace_cachefiles_mkdir(object, next, ret); if (ret < 0) goto create_error; @@ -587,6 +598,7 @@ lookup_again: start = jiffies; ret = vfs_create(d_inode(dir), next, S_IFREG, true); cachefiles_hist(cachefiles_create_histogram, start); + trace_cachefiles_create(object, next, ret); if (ret < 0) goto create_error; @@ -629,7 +641,8 @@ lookup_again: * mutex) */ object->dentry = NULL; - ret = cachefiles_bury_object(cache, dir, next, true, + ret = cachefiles_bury_object(cache, object, dir, next, + true, FSCACHE_OBJECT_IS_STALE); dput(next); next = NULL; @@ -955,7 +968,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, /* actually remove the victim (drops the dir mutex) */ _debug("bury"); - ret = cachefiles_bury_object(cache, dir, victim, false, + ret = cachefiles_bury_object(cache, NULL, dir, victim, false, FSCACHE_OBJECT_WAS_CULLED); if (ret < 0) goto error; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 883bc7bb12c5..5082c8a49686 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -952,6 +952,7 @@ error: * - cache withdrawal is prevented by the caller */ void cachefiles_uncache_page(struct fscache_object *_object, struct page *page) + __releases(&object->fscache.cookie->lock) { struct cachefiles_object *object; struct cachefiles_cache *cache; diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index d31c1a72d8a5..0a29a00aed2e 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -113,6 +113,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object, /* attempt to install the cache metadata directly */ 
_debug("SET #%u", auxdata->len); + clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags); ret = vfs_setxattr(dentry, cachefiles_xattr_cache, &auxdata->type, auxdata->len, XATTR_CREATE); @@ -141,6 +142,7 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object, /* attempt to install the cache metadata directly */ _debug("SET #%u", auxdata->len); + clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags); ret = vfs_setxattr(dentry, cachefiles_xattr_cache, &auxdata->type, auxdata->len, XATTR_REPLACE); @@ -180,7 +182,8 @@ int cachefiles_check_auxdata(struct cachefiles_object *object) goto error; xlen--; - validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen); + validity = fscache_check_aux(&object->fscache, &auxbuf->data, xlen, + i_size_read(d_backing_inode(dentry))); if (validity != FSCACHE_CHECKAUX_OKAY) goto error; @@ -249,7 +252,8 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, object->fscache.cookie->def->name, dlen); result = fscache_check_aux(&object->fscache, - &auxbuf->data, dlen); + &auxbuf->data, dlen, + i_size_read(d_backing_inode(dentry))); switch (result) { /* entry okay as is */ diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 174f5709e508..a699e320393f 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ - export.o caps.o snap.o xattr.o \ + export.o caps.o snap.o xattr.o quota.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ debugfs.o diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b4336b42ce3b..5f7ad3d0df2e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -15,6 +15,7 @@ #include "mds_client.h" #include "cache.h" #include <linux/ceph/osd_client.h> +#include <linux/ceph/striper.h> /* * Ceph address space ops. 
@@ -438,7 +439,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, { struct inode *inode = file_inode(file); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_file_info *ci = file->private_data; + struct ceph_file_info *fi = file->private_data; struct ceph_rw_context *rw_ctx; int rc = 0; int max = 0; @@ -452,7 +453,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, if (rc == 0) goto out; - rw_ctx = ceph_find_rw_context(ci); + rw_ctx = ceph_find_rw_context(fi); max = fsc->mount_options->rsize >> PAGE_SHIFT; dout("readpages %p file %p ctx %p nr_pages %d max %d\n", inode, file, rw_ctx, nr_pages, max); @@ -800,7 +801,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct ceph_osd_request *req = NULL; struct ceph_writeback_ctl ceph_wbc; bool should_loop, range_whole = false; - bool stop, done = false; + bool done = false; dout("writepages_start %p (mode=%s)\n", inode, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : @@ -856,7 +857,7 @@ retry: * in that range can be associated with newer snapc. 
* They are not writeable until we write all dirty pages * associated with 'snapc' get written */ - if (index > 0 || wbc->sync_mode != WB_SYNC_NONE) + if (index > 0) should_loop = true; dout(" non-head snapc, range whole\n"); } @@ -864,8 +865,7 @@ retry: ceph_put_snap_context(last_snapc); last_snapc = snapc; - stop = false; - while (!stop && index <= end) { + while (!done && index <= end) { int num_ops = 0, op_idx; unsigned i, pvec_pages, max_pages, locked_pages = 0; struct page **pages = NULL, **data_pages; @@ -898,16 +898,30 @@ get_more_pages: unlock_page(page); continue; } - if (strip_unit_end && (page->index > strip_unit_end)) { - dout("end of strip unit %p\n", page); + /* only if matching snap context */ + pgsnapc = page_snap_context(page); + if (pgsnapc != snapc) { + dout("page snapc %p %lld != oldest %p %lld\n", + pgsnapc, pgsnapc->seq, snapc, snapc->seq); + if (!should_loop && + !ceph_wbc.head_snapc && + wbc->sync_mode != WB_SYNC_NONE) + should_loop = true; unlock_page(page); - break; + continue; } if (page_offset(page) >= ceph_wbc.i_size) { dout("%p page eof %llu\n", page, ceph_wbc.i_size); - /* not done if range_cyclic */ - stop = true; + if (ceph_wbc.size_stable || + page_offset(page) >= i_size_read(inode)) + mapping->a_ops->invalidatepage(page, + 0, PAGE_SIZE); + unlock_page(page); + continue; + } + if (strip_unit_end && (page->index > strip_unit_end)) { + dout("end of strip unit %p\n", page); unlock_page(page); break; } @@ -921,15 +935,6 @@ get_more_pages: wait_on_page_writeback(page); } - /* only if matching snap context */ - pgsnapc = page_snap_context(page); - if (pgsnapc != snapc) { - dout("page snapc %p %lld != oldest %p %lld\n", - pgsnapc, pgsnapc->seq, snapc, snapc->seq); - unlock_page(page); - continue; - } - if (!clear_page_dirty_for_io(page)) { dout("%p !clear_page_dirty_for_io\n", page); unlock_page(page); @@ -945,19 +950,15 @@ get_more_pages: if (locked_pages == 0) { u64 objnum; u64 objoff; + u32 xlen; /* prepare async write request */ 
offset = (u64)page_offset(page); - len = wsize; - - rc = ceph_calc_file_object_mapping(&ci->i_layout, - offset, len, - &objnum, &objoff, - &len); - if (rc < 0) { - unlock_page(page); - break; - } + ceph_calc_file_object_mapping(&ci->i_layout, + offset, wsize, + &objnum, &objoff, + &xlen); + len = xlen; num_ops = 1; strip_unit_end = page->index + @@ -1146,7 +1147,7 @@ new_request: * we tagged for writeback prior to entering this loop. */ if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) - done = stop = true; + done = true; release_pvec_pages: dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a3ab265d3215..bb524c880b1e 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -27,7 +27,6 @@ struct ceph_aux_inode { u64 version; struct timespec mtime; - loff_t size; }; struct fscache_netfs ceph_cache_netfs = { @@ -41,37 +40,18 @@ static LIST_HEAD(ceph_fscache_list); struct ceph_fscache_entry { struct list_head list; struct fscache_cookie *fscache; - struct ceph_fsid fsid; size_t uniq_len; + /* The following members must be last */ + struct ceph_fsid fsid; char uniquifier[0]; }; -static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t maxbuf) -{ - const struct ceph_fs_client* fsc = cookie_netfs_data; - const char *fscache_uniq = fsc->mount_options->fscache_uniq; - uint16_t fsid_len, uniq_len; - - fsid_len = sizeof(fsc->client->fsid); - uniq_len = fscache_uniq ? 
strlen(fscache_uniq) : 0; - if (fsid_len + uniq_len > maxbuf) - return 0; - - memcpy(buffer, &fsc->client->fsid, fsid_len); - if (uniq_len) - memcpy(buffer + fsid_len, fscache_uniq, uniq_len); - - return fsid_len + uniq_len; -} - static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { .name = "CEPH.fsid", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = ceph_fscache_session_get_key, }; -int ceph_fscache_register(void) +int __init ceph_fscache_register(void) { return fscache_register_netfs(&ceph_cache_netfs); } @@ -110,16 +90,19 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) goto out_unlock; } + memcpy(&ent->fsid, fsid, sizeof(*fsid)); + if (uniq_len > 0) { + memcpy(&ent->uniquifier, fscache_uniq, uniq_len); + ent->uniq_len = uniq_len; + } + fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, &ceph_fscache_fsid_object_def, - fsc, true); + &ent->fsid, sizeof(ent->fsid) + uniq_len, + NULL, 0, + fsc, 0, true); if (fsc->fscache) { - memcpy(&ent->fsid, fsid, sizeof(*fsid)); - if (uniq_len > 0) { - memcpy(&ent->uniquifier, fscache_uniq, uniq_len); - ent->uniq_len = uniq_len; - } ent->fscache = fsc->fscache; list_add_tail(&ent->list, &ceph_fscache_list); } else { @@ -133,73 +116,32 @@ out_unlock: return err; } -static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t maxbuf) -{ - const struct ceph_inode_info* ci = cookie_netfs_data; - uint16_t klen; - - /* use ceph virtual inode (id + snapshot) */ - klen = sizeof(ci->i_vino); - if (klen > maxbuf) - return 0; - - memcpy(buffer, &ci->i_vino, klen); - return klen; -} - -static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - struct ceph_aux_inode aux; - const struct ceph_inode_info* ci = cookie_netfs_data; - const struct inode* inode = &ci->vfs_inode; - - memset(&aux, 0, sizeof(aux)); - aux.version = ci->i_version; - aux.mtime = inode->i_mtime; - aux.size = i_size_read(inode); - - 
memcpy(buffer, &aux, sizeof(aux)); - - return sizeof(aux); -} - -static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data, - uint64_t *size) -{ - const struct ceph_inode_info* ci = cookie_netfs_data; - *size = i_size_read(&ci->vfs_inode); -} - static enum fscache_checkaux ceph_fscache_inode_check_aux( - void *cookie_netfs_data, const void *data, uint16_t dlen) + void *cookie_netfs_data, const void *data, uint16_t dlen, + loff_t object_size) { struct ceph_aux_inode aux; struct ceph_inode_info* ci = cookie_netfs_data; struct inode* inode = &ci->vfs_inode; - if (dlen != sizeof(aux)) + if (dlen != sizeof(aux) || + i_size_read(inode) != object_size) return FSCACHE_CHECKAUX_OBSOLETE; memset(&aux, 0, sizeof(aux)); aux.version = ci->i_version; aux.mtime = inode->i_mtime; - aux.size = i_size_read(inode); if (memcmp(data, &aux, sizeof(aux)) != 0) return FSCACHE_CHECKAUX_OBSOLETE; - dout("ceph inode 0x%p cached okay", ci); + dout("ceph inode 0x%p cached okay\n", ci); return FSCACHE_CHECKAUX_OKAY; } static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .name = "CEPH.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .get_key = ceph_fscache_inode_get_key, - .get_attr = ceph_fscache_inode_get_attr, - .get_aux = ceph_fscache_inode_get_aux, .check_aux = ceph_fscache_inode_check_aux, }; @@ -207,6 +149,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_aux_inode aux; /* No caching for filesystem */ if (!fsc->fscache) @@ -218,9 +161,14 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) inode_lock_nested(inode, I_MUTEX_CHILD); if (!ci->fscache) { + memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; + aux.mtime = inode->i_mtime; ci->fscache = fscache_acquire_cookie(fsc->fscache, - &ceph_fscache_inode_object_def, - ci, false); + &ceph_fscache_inode_object_def, + &ci->i_vino, sizeof(ci->i_vino), + 
&aux, sizeof(aux), + ci, i_size_read(inode), false); } inode_unlock(inode); } @@ -235,7 +183,7 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) ci->fscache = NULL; fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode); - fscache_relinquish_cookie(cookie, 0); + fscache_relinquish_cookie(cookie, &ci->i_vino, false); } static bool ceph_fscache_can_enable(void *data) @@ -254,11 +202,11 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) if (inode_is_open_for_write(inode)) { dout("fscache_file_set_cookie %p %p disabling cache\n", inode, filp); - fscache_disable_cookie(ci->fscache, false); + fscache_disable_cookie(ci->fscache, &ci->i_vino, false); fscache_uncache_all_inode_pages(ci->fscache, inode); } else { - fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable, - inode); + fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode), + ceph_fscache_can_enable, inode); if (fscache_cookie_enabled(ci->fscache)) { dout("fscache_file_set_cookie %p %p enabling cache\n", inode, filp); @@ -351,7 +299,8 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page) if (!cache_valid(ci)) return; - ret = fscache_write_page(ci->fscache, page, GFP_KERNEL); + ret = fscache_write_page(ci->fscache, page, i_size_read(inode), + GFP_KERNEL); if (ret) fscache_uncache_page(ci->fscache, page); } @@ -385,7 +334,7 @@ void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) WARN_ON_ONCE(!found); mutex_unlock(&ceph_fscache_lock); - __fscache_relinquish_cookie(fsc->fscache, 0); + __fscache_relinquish_cookie(fsc->fscache, NULL, false); } fsc->fscache = NULL; } @@ -402,7 +351,7 @@ void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) * truncate while the caller holds CEPH_CAP_FILE_RD */ mutex_lock(&ci->i_truncate_mutex); if (!cache_valid(ci)) { - if (fscache_check_consistency(ci->fscache)) + if (fscache_check_consistency(ci->fscache, &ci->i_vino)) fscache_invalidate(ci->fscache); spin_lock(&ci->i_ceph_lock); 
ci->i_fscache_gen = ci->i_rdcache_gen; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0e5bd3e3344e..23dbfae16156 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -184,36 +184,54 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); - for (i = have; i < need; i++) { -retry: + for (i = have; i < need; ) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); - if (!cap) { - if (!trimmed) { - for (j = 0; j < mdsc->max_sessions; j++) { - s = __ceph_lookup_mds_session(mdsc, j); - if (!s) - continue; - mutex_unlock(&mdsc->mutex); + if (cap) { + list_add(&cap->caps_item, &newcaps); + alloc++; + i++; + continue; + } - mutex_lock(&s->s_mutex); - max_caps = s->s_nr_caps - (need - i); - ceph_trim_caps(mdsc, s, max_caps); - mutex_unlock(&s->s_mutex); + if (!trimmed) { + for (j = 0; j < mdsc->max_sessions; j++) { + s = __ceph_lookup_mds_session(mdsc, j); + if (!s) + continue; + mutex_unlock(&mdsc->mutex); - ceph_put_mds_session(s); - mutex_lock(&mdsc->mutex); - } - trimmed = true; - goto retry; - } else { - pr_warn("reserve caps ctx=%p ENOMEM " - "need=%d got=%d\n", - ctx, need, have + alloc); - goto out_nomem; + mutex_lock(&s->s_mutex); + max_caps = s->s_nr_caps - (need - i); + ceph_trim_caps(mdsc, s, max_caps); + mutex_unlock(&s->s_mutex); + + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); } + trimmed = true; + + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_avail_count) { + int more_have; + if (mdsc->caps_avail_count >= need - i) + more_have = need - i; + else + more_have = mdsc->caps_avail_count; + + i += more_have; + have += more_have; + mdsc->caps_avail_count -= more_have; + mdsc->caps_reserve_count += more_have; + + } + spin_unlock(&mdsc->caps_list_lock); + + continue; } - list_add(&cap->caps_item, &newcaps); - alloc++; + + pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have + alloc); + goto out_nomem; } BUG_ON(have + alloc != need); @@ -234,16 +252,28 @@ retry: return 0; 
out_nomem: + + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_avail_count += have; + mdsc->caps_reserve_count -= have; + while (!list_empty(&newcaps)) { cap = list_first_entry(&newcaps, struct ceph_cap, caps_item); list_del(&cap->caps_item); - kmem_cache_free(ceph_cap_cachep, cap); + + /* Keep some preallocated caps around (ceph_min_count), to + * avoid lots of free/alloc churn. */ + if (mdsc->caps_avail_count >= + mdsc->caps_reserve_count + mdsc->caps_min_count) { + kmem_cache_free(ceph_cap_cachep, cap); + } else { + mdsc->caps_avail_count++; + mdsc->caps_total_count++; + list_add(&cap->caps_item, &mdsc->caps_list); + } } - spin_lock(&mdsc->caps_list_lock); - mdsc->caps_avail_count += have; - mdsc->caps_reserve_count -= have; BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); @@ -254,12 +284,26 @@ out_nomem: int ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx) { + int i; + struct ceph_cap *cap; + dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); if (ctx->count) { spin_lock(&mdsc->caps_list_lock); BUG_ON(mdsc->caps_reserve_count < ctx->count); mdsc->caps_reserve_count -= ctx->count; - mdsc->caps_avail_count += ctx->count; + if (mdsc->caps_avail_count >= + mdsc->caps_reserve_count + mdsc->caps_min_count) { + mdsc->caps_total_count -= ctx->count; + for (i = 0; i < ctx->count; i++) { + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); + list_del(&cap->caps_item); + kmem_cache_free(ceph_cap_cachep, cap); + } + } else { + mdsc->caps_avail_count += ctx->count; + } ctx->count = 0; dout("unreserve caps %d = %d used + %d resv + %d avail\n", mdsc->caps_total_count, mdsc->caps_use_count, @@ -285,7 +329,23 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, mdsc->caps_use_count++; mdsc->caps_total_count++; spin_unlock(&mdsc->caps_list_lock); + } else { + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_avail_count) { + 
BUG_ON(list_empty(&mdsc->caps_list)); + + mdsc->caps_avail_count--; + mdsc->caps_use_count++; + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); + list_del(&cap->caps_item); + + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + } + spin_unlock(&mdsc->caps_list_lock); } + return cap; } @@ -341,6 +401,8 @@ void ceph_reservation_status(struct ceph_fs_client *fsc, { struct ceph_mds_client *mdsc = fsc->mdsc; + spin_lock(&mdsc->caps_list_lock); + if (total) *total = mdsc->caps_total_count; if (avail) @@ -351,6 +413,8 @@ void ceph_reservation_status(struct ceph_fs_client *fsc, *reserved = mdsc->caps_reserve_count; if (min) *min = mdsc->caps_min_count; + + spin_unlock(&mdsc->caps_list_lock); } /* @@ -639,9 +703,11 @@ void ceph_add_cap(struct inode *inode, } spin_lock(&realm->inodes_with_caps_lock); - ci->i_snap_realm = realm; list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps); + ci->i_snap_realm = realm; + if (realm->ino == ci->i_vino.ino) + realm->inode = inode; spin_unlock(&realm->inodes_with_caps_lock); if (oldrealm) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 644def813754..abdf98deeec4 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -260,7 +260,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) goto out; fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", - 0600, + 0400, fsc->client->debugfs_dir, fsc, &mdsmap_show_fops); @@ -268,7 +268,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) goto out; fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions", - 0600, + 0400, fsc->client->debugfs_dir, fsc, &mds_sessions_show_fops); @@ -276,7 +276,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) goto out; fsc->debugfs_mdsc = debugfs_create_file("mdsc", - 0600, + 0400, fsc->client->debugfs_dir, fsc, &mdsc_show_fops); @@ -292,7 +292,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) goto out; fsc->debugfs_dentry_lru = 
debugfs_create_file("dentry_lru", - 0600, + 0400, fsc->client->debugfs_dir, fsc, &dentry_lru_show_fops); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f1d9c6cc0491..1a78dd6f8bf2 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -2,7 +2,6 @@ #include <linux/ceph/ceph_debug.h> #include <linux/spinlock.h> -#include <linux/fs_struct.h> #include <linux/namei.h> #include <linux/slab.h> #include <linux/sched.h> @@ -102,18 +101,18 @@ static int fpos_cmp(loff_t l, loff_t r) * regardless of what dir changes take place on the * server. */ -static int note_last_dentry(struct ceph_file_info *fi, const char *name, +static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name, int len, unsigned next_offset) { char *buf = kmalloc(len+1, GFP_KERNEL); if (!buf) return -ENOMEM; - kfree(fi->last_name); - fi->last_name = buf; - memcpy(fi->last_name, name, len); - fi->last_name[len] = 0; - fi->next_offset = next_offset; - dout("note_last_dentry '%s'\n", fi->last_name); + kfree(dfi->last_name); + dfi->last_name = buf; + memcpy(dfi->last_name, name, len); + dfi->last_name[len] = 0; + dfi->next_offset = next_offset; + dout("note_last_dentry '%s'\n", dfi->last_name); return 0; } @@ -175,7 +174,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, static int __dcache_readdir(struct file *file, struct dir_context *ctx, int shared_gen) { - struct ceph_file_info *fi = file->private_data; + struct ceph_dir_file_info *dfi = file->private_data; struct dentry *parent = file->f_path.dentry; struct inode *dir = d_inode(parent); struct dentry *dentry, *last = NULL; @@ -222,7 +221,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, bool emit_dentry = false; dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl); if (!dentry) { - fi->flags |= CEPH_F_ATEND; + dfi->file_info.flags |= CEPH_F_ATEND; err = 0; break; } @@ -273,33 +272,33 @@ out: if (last) { int ret; di = ceph_dentry(last); - ret = note_last_dentry(fi, last->d_name.name, last->d_name.len, 
+ ret = note_last_dentry(dfi, last->d_name.name, last->d_name.len, fpos_off(di->offset) + 1); if (ret < 0) err = ret; dput(last); /* last_name no longer match cache index */ - if (fi->readdir_cache_idx >= 0) { - fi->readdir_cache_idx = -1; - fi->dir_release_count = 0; + if (dfi->readdir_cache_idx >= 0) { + dfi->readdir_cache_idx = -1; + dfi->dir_release_count = 0; } } return err; } -static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) +static bool need_send_readdir(struct ceph_dir_file_info *dfi, loff_t pos) { - if (!fi->last_readdir) + if (!dfi->last_readdir) return true; if (is_hash_order(pos)) - return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); + return !ceph_frag_contains_value(dfi->frag, fpos_hash(pos)); else - return fi->frag != fpos_frag(pos); + return dfi->frag != fpos_frag(pos); } static int ceph_readdir(struct file *file, struct dir_context *ctx) { - struct ceph_file_info *fi = file->private_data; + struct ceph_dir_file_info *dfi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); @@ -310,7 +309,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_mds_reply_info_parsed *rinfo; dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); - if (fi->flags & CEPH_F_ATEND) + if (dfi->file_info.flags & CEPH_F_ATEND) return 0; /* always start with . and .. */ @@ -351,15 +350,15 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* proceed with a normal readdir */ more: /* do we have the correct frag content buffered? */ - if (need_send_readdir(fi, ctx->pos)) { + if (need_send_readdir(dfi, ctx->pos)) { struct ceph_mds_request *req; int op = ceph_snap(inode) == CEPH_SNAPDIR ? 
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; /* discard old result, if any */ - if (fi->last_readdir) { - ceph_mdsc_put_request(fi->last_readdir); - fi->last_readdir = NULL; + if (dfi->last_readdir) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; } if (is_hash_order(ctx->pos)) { @@ -373,7 +372,7 @@ more: } dout("readdir fetching %llx.%llx frag %x offset '%s'\n", - ceph_vinop(inode), frag, fi->last_name); + ceph_vinop(inode), frag, dfi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); @@ -389,8 +388,8 @@ more: __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); req->r_inode_drop = CEPH_CAP_FILE_EXCL; } - if (fi->last_name) { - req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); + if (dfi->last_name) { + req->r_path2 = kstrdup(dfi->last_name, GFP_KERNEL); if (!req->r_path2) { ceph_mdsc_put_request(req); return -ENOMEM; @@ -400,10 +399,10 @@ more: cpu_to_le32(fpos_hash(ctx->pos)); } - req->r_dir_release_cnt = fi->dir_release_count; - req->r_dir_ordered_cnt = fi->dir_ordered_count; - req->r_readdir_cache_idx = fi->readdir_cache_idx; - req->r_readdir_offset = fi->next_offset; + req->r_dir_release_cnt = dfi->dir_release_count; + req->r_dir_ordered_cnt = dfi->dir_ordered_count; + req->r_readdir_cache_idx = dfi->readdir_cache_idx; + req->r_readdir_offset = dfi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); req->r_args.readdir.flags = cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); @@ -427,35 +426,35 @@ more: if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); if (!rinfo->hash_order) { - fi->next_offset = req->r_readdir_offset; + dfi->next_offset = req->r_readdir_offset; /* adjust ctx->pos to beginning of frag */ ctx->pos = ceph_make_fpos(frag, - fi->next_offset, + dfi->next_offset, false); } } - fi->frag = frag; - fi->last_readdir = req; + dfi->frag = frag; + dfi->last_readdir = req; if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) { 
- fi->readdir_cache_idx = req->r_readdir_cache_idx; - if (fi->readdir_cache_idx < 0) { + dfi->readdir_cache_idx = req->r_readdir_cache_idx; + if (dfi->readdir_cache_idx < 0) { /* preclude from marking dir ordered */ - fi->dir_ordered_count = 0; + dfi->dir_ordered_count = 0; } else if (ceph_frag_is_leftmost(frag) && - fi->next_offset == 2) { + dfi->next_offset == 2) { /* note dir version at start of readdir so * we can tell if any dentries get dropped */ - fi->dir_release_count = req->r_dir_release_cnt; - fi->dir_ordered_count = req->r_dir_ordered_cnt; + dfi->dir_release_count = req->r_dir_release_cnt; + dfi->dir_ordered_count = req->r_dir_ordered_cnt; } } else { - dout("readdir !did_prepopulate"); + dout("readdir !did_prepopulate\n"); /* disable readdir cache */ - fi->readdir_cache_idx = -1; + dfi->readdir_cache_idx = -1; /* preclude from marking dir complete */ - fi->dir_release_count = 0; + dfi->dir_release_count = 0; } /* note next offset and last dentry name */ @@ -464,19 +463,19 @@ more: rinfo->dir_entries + (rinfo->dir_nr-1); unsigned next_offset = req->r_reply_info.dir_end ? 2 : (fpos_off(rde->offset) + 1); - err = note_last_dentry(fi, rde->name, rde->name_len, + err = note_last_dentry(dfi, rde->name, rde->name_len, next_offset); if (err) return err; } else if (req->r_reply_info.dir_end) { - fi->next_offset = 2; + dfi->next_offset = 2; /* keep last name */ } } - rinfo = &fi->last_readdir->r_reply_info; + rinfo = &dfi->last_readdir->r_reply_info; dout("readdir frag %x num %d pos %llx chunk first %llx\n", - fi->frag, rinfo->dir_nr, ctx->pos, + dfi->frag, rinfo->dir_nr, ctx->pos, rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); i = 0; @@ -520,52 +519,55 @@ more: ctx->pos++; } - ceph_mdsc_put_request(fi->last_readdir); - fi->last_readdir = NULL; + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; - if (fi->next_offset > 2) { - frag = fi->frag; + if (dfi->next_offset > 2) { + frag = dfi->frag; goto more; } /* more frags? 
*/ - if (!ceph_frag_is_rightmost(fi->frag)) { - frag = ceph_frag_next(fi->frag); + if (!ceph_frag_is_rightmost(dfi->frag)) { + frag = ceph_frag_next(dfi->frag); if (is_hash_order(ctx->pos)) { loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), - fi->next_offset, true); + dfi->next_offset, true); if (new_pos > ctx->pos) ctx->pos = new_pos; /* keep last_name */ } else { - ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); - kfree(fi->last_name); - fi->last_name = NULL; + ctx->pos = ceph_make_fpos(frag, dfi->next_offset, + false); + kfree(dfi->last_name); + dfi->last_name = NULL; } dout("readdir next frag is %x\n", frag); goto more; } - fi->flags |= CEPH_F_ATEND; + dfi->file_info.flags |= CEPH_F_ATEND; /* * if dir_release_count still matches the dir, no dentries * were released during the whole readdir, and we should have * the complete dir contents in our cache. */ - if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) { + if (atomic64_read(&ci->i_release_count) == + dfi->dir_release_count) { spin_lock(&ci->i_ceph_lock); - if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) { + if (dfi->dir_ordered_count == + atomic64_read(&ci->i_ordered_count)) { dout(" marking %p complete and ordered\n", inode); /* use i_size to track number of entries in * readdir cache */ - BUG_ON(fi->readdir_cache_idx < 0); - i_size_write(inode, fi->readdir_cache_idx * + BUG_ON(dfi->readdir_cache_idx < 0); + i_size_write(inode, dfi->readdir_cache_idx * sizeof(struct dentry*)); } else { dout(" marking %p complete\n", inode); } - __ceph_dir_set_complete(ci, fi->dir_release_count, - fi->dir_ordered_count); + __ceph_dir_set_complete(ci, dfi->dir_release_count, + dfi->dir_ordered_count); spin_unlock(&ci->i_ceph_lock); } @@ -573,25 +575,25 @@ more: return 0; } -static void reset_readdir(struct ceph_file_info *fi) +static void reset_readdir(struct ceph_dir_file_info *dfi) { - if (fi->last_readdir) { - ceph_mdsc_put_request(fi->last_readdir); - fi->last_readdir = 
NULL; + if (dfi->last_readdir) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; } - kfree(fi->last_name); - fi->last_name = NULL; - fi->dir_release_count = 0; - fi->readdir_cache_idx = -1; - fi->next_offset = 2; /* compensate for . and .. */ - fi->flags &= ~CEPH_F_ATEND; + kfree(dfi->last_name); + dfi->last_name = NULL; + dfi->dir_release_count = 0; + dfi->readdir_cache_idx = -1; + dfi->next_offset = 2; /* compensate for . and .. */ + dfi->file_info.flags &= ~CEPH_F_ATEND; } /* * discard buffered readdir content on seekdir(0), or seek to new frag, * or seek prior to current chunk */ -static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) +static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos) { struct ceph_mds_reply_info_parsed *rinfo; loff_t chunk_offset; @@ -600,10 +602,10 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) if (is_hash_order(new_pos)) { /* no need to reset last_name for a forward seek when * dentries are sotred in hash order */ - } else if (fi->frag != fpos_frag(new_pos)) { + } else if (dfi->frag != fpos_frag(new_pos)) { return true; } - rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; + rinfo = dfi->last_readdir ? 
&dfi->last_readdir->r_reply_info : NULL; if (!rinfo || !rinfo->dir_nr) return true; chunk_offset = rinfo->dir_entries[0].offset; @@ -613,7 +615,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { - struct ceph_file_info *fi = file->private_data; + struct ceph_dir_file_info *dfi = file->private_data; struct inode *inode = file->f_mapping->host; loff_t retval; @@ -631,20 +633,20 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } if (offset >= 0) { - if (need_reset_readdir(fi, offset)) { + if (need_reset_readdir(dfi, offset)) { dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi); + reset_readdir(dfi); } else if (is_hash_order(offset) && offset > file->f_pos) { /* for hash offset, we don't know if a forward seek * is within same frag */ - fi->dir_release_count = 0; - fi->readdir_cache_idx = -1; + dfi->dir_release_count = 0; + dfi->readdir_cache_idx = -1; } if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; - fi->flags &= ~CEPH_F_ATEND; + dfi->file_info.flags &= ~CEPH_F_ATEND; } retval = offset; } @@ -825,6 +827,9 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; + if (ceph_quota_is_max_files_exceeded(dir)) + return -EDQUOT; + err = ceph_pre_init_acls(dir, &mode, &acls); if (err < 0) return err; @@ -878,6 +883,9 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; + if (ceph_quota_is_max_files_exceeded(dir)) + return -EDQUOT; + dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); if (IS_ERR(req)) { @@ -927,6 +935,12 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out; } + if (op == CEPH_MDS_OP_MKDIR && + ceph_quota_is_max_files_exceeded(dir)) { + 
err = -EDQUOT; + goto out; + } + mode |= S_IFDIR; err = ceph_pre_init_acls(dir, &mode, &acls); if (err < 0) @@ -1066,6 +1080,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, else return -EROFS; } + /* don't allow cross-quota renames */ + if ((old_dir != new_dir) && + (!ceph_quota_is_same_realm(old_dir, new_dir))) + return -EXDEV; + dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -1352,7 +1371,7 @@ static void ceph_d_prune(struct dentry *dentry) static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, loff_t *ppos) { - struct ceph_file_info *cf = file->private_data; + struct ceph_dir_file_info *dfi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); int left; @@ -1361,12 +1380,12 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) return -EISDIR; - if (!cf->dir_info) { - cf->dir_info = kmalloc(bufsize, GFP_KERNEL); - if (!cf->dir_info) + if (!dfi->dir_info) { + dfi->dir_info = kmalloc(bufsize, GFP_KERNEL); + if (!dfi->dir_info) return -ENOMEM; - cf->dir_info_len = - snprintf(cf->dir_info, bufsize, + dfi->dir_info_len = + snprintf(dfi->dir_info, bufsize, "entries: %20lld\n" " files: %20lld\n" " subdirs: %20lld\n" @@ -1386,10 +1405,10 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, (long)ci->i_rctime.tv_nsec); } - if (*ppos >= cf->dir_info_len) + if (*ppos >= dfi->dir_info_len) return 0; - size = min_t(unsigned, size, cf->dir_info_len-*ppos); - left = copy_to_user(buf, cf->dir_info + *ppos, size); + size = min_t(unsigned, size, dfi->dir_info_len-*ppos); + left = copy_to_user(buf, dfi->dir_info + *ppos, size); if (left == size) return -EFAULT; *ppos += (size - left); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 
b67eec3532a1..f85040d73e3d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -30,6 +30,8 @@ static __le32 ceph_flags_sys2wire(u32 flags) break; } + flags &= ~O_ACCMODE; + #define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; } ceph_sys2wire(O_CREAT); @@ -41,7 +43,7 @@ static __le32 ceph_flags_sys2wire(u32 flags) #undef ceph_sys2wire if (flags) - dout("unused open flags: %x", flags); + dout("unused open flags: %x\n", flags); return cpu_to_le32(wire_flags); } @@ -159,13 +161,50 @@ out: return req; } +static int ceph_init_file_info(struct inode *inode, struct file *file, + int fmode, bool isdir) +{ + struct ceph_file_info *fi; + + dout("%s %p %p 0%o (%s)\n", __func__, inode, file, + inode->i_mode, isdir ? "dir" : "regular"); + BUG_ON(inode->i_fop->release != ceph_release); + + if (isdir) { + struct ceph_dir_file_info *dfi = + kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); + if (!dfi) { + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + return -ENOMEM; + } + + file->private_data = dfi; + fi = &dfi->file_info; + dfi->next_offset = 2; + dfi->readdir_cache_idx = -1; + } else { + fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); + if (!fi) { + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + return -ENOMEM; + } + + file->private_data = fi; + } + + fi->fmode = fmode; + spin_lock_init(&fi->rw_contexts_lock); + INIT_LIST_HEAD(&fi->rw_contexts); + + return 0; +} + /* * initialize private struct file data. 
* if we fail, clean up by dropping fmode reference on the ceph_inode */ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { - struct ceph_file_info *cf; int ret = 0; switch (inode->i_mode & S_IFMT) { @@ -173,22 +212,10 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) ceph_fscache_register_inode_cookie(inode); ceph_fscache_file_set_cookie(inode, file); case S_IFDIR: - dout("init_file %p %p 0%o (regular)\n", inode, file, - inode->i_mode); - cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); - if (!cf) { - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ - return -ENOMEM; - } - cf->fmode = fmode; - - spin_lock_init(&cf->rw_contexts_lock); - INIT_LIST_HEAD(&cf->rw_contexts); - - cf->next_offset = 2; - cf->readdir_cache_idx = -1; - file->private_data = cf; - BUG_ON(inode->i_fop->release != ceph_release); + ret = ceph_init_file_info(inode, file, fmode, + S_ISDIR(inode->i_mode)); + if (ret) + return ret; break; case S_IFLNK: @@ -278,11 +305,11 @@ int ceph_open(struct inode *inode, struct file *file) struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; - struct ceph_file_info *cf = file->private_data; + struct ceph_file_info *fi = file->private_data; int err; int flags, fmode, wanted; - if (cf) { + if (fi) { dout("open file %p is already opened\n", file); return 0; } @@ -375,7 +402,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct ceph_mds_request *req; struct dentry *dn; struct ceph_acls_info acls = {}; - int mask; + int mask; int err; dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", @@ -386,6 +413,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, return -ENAMETOOLONG; if (flags & O_CREAT) { + if (ceph_quota_is_max_files_exceeded(dir)) + return -EDQUOT; err = ceph_pre_init_acls(dir, &mode, &acls); if (err < 0) return err; @@ -460,16 +489,27 @@ out_acl: int 
ceph_release(struct inode *inode, struct file *file) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_file_info *cf = file->private_data; - dout("release inode %p file %p\n", inode, file); - ceph_put_fmode(ci, cf->fmode); - if (cf->last_readdir) - ceph_mdsc_put_request(cf->last_readdir); - kfree(cf->last_name); - kfree(cf->dir_info); - WARN_ON(!list_empty(&cf->rw_contexts)); - kmem_cache_free(ceph_file_cachep, cf); + if (S_ISDIR(inode->i_mode)) { + struct ceph_dir_file_info *dfi = file->private_data; + dout("release inode %p dir file %p\n", inode, file); + WARN_ON(!list_empty(&dfi->file_info.rw_contexts)); + + ceph_put_fmode(ci, dfi->file_info.fmode); + + if (dfi->last_readdir) + ceph_mdsc_put_request(dfi->last_readdir); + kfree(dfi->last_name); + kfree(dfi->dir_info); + kmem_cache_free(ceph_dir_file_cachep, dfi); + } else { + struct ceph_file_info *fi = file->private_data; + dout("release inode %p regular file %p\n", inode, file); + WARN_ON(!list_empty(&fi->rw_contexts)); + + ceph_put_fmode(ci, fi->fmode); + kmem_cache_free(ceph_file_cachep, fi); + } /* wake up anyone waiting for caps on this inode */ wake_up_all(&ci->i_cap_wq); @@ -1338,6 +1378,11 @@ retry_snap: pos = iocb->ki_pos; count = iov_iter_count(from); + if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) { + err = -EDQUOT; + goto out; + } + err = file_remove_privs(file); if (err) goto out; @@ -1419,6 +1464,7 @@ retry_snap: if (written >= 0) { int dirty; + spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, @@ -1426,6 +1472,8 @@ retry_snap: spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); + if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) + ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL); } dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", @@ -1668,6 +1716,12 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } + if (!(mode & 
(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) && + ceph_quota_is_max_bytes_exceeded(inode, offset + length)) { + ret = -EDQUOT; + goto unlock; + } + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && !(mode & FALLOC_FL_PUNCH_HOLE)) { ret = -ENOSPC; @@ -1716,6 +1770,9 @@ static long ceph_fallocate(struct file *file, int mode, spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); + if ((endoff > size) && + ceph_quota_is_max_bytes_approaching(inode, endoff)) + ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL); } ceph_put_cap_refs(ci, got); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index c6ec5aa46100..ae056927080d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -441,6 +441,9 @@ struct inode *ceph_alloc_inode(struct super_block *sb) atomic64_set(&ci->i_complete_seq[1], 0); ci->i_symlink = NULL; + ci->i_max_bytes = 0; + ci->i_max_files = 0; + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL); @@ -536,6 +539,9 @@ void ceph_destroy_inode(struct inode *inode) ceph_queue_caps_release(inode); + if (__ceph_has_any_quota(ci)) + ceph_adjust_quota_realms_count(inode, false); + /* * we may still have a snap_realm reference if there are stray * caps in i_snap_caps. 
@@ -548,6 +554,9 @@ void ceph_destroy_inode(struct inode *inode) dout(" dropping residual ref to snap realm %p\n", realm); spin_lock(&realm->inodes_with_caps_lock); list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm = NULL; + if (realm->ino == ci->i_vino.ino) + realm->inode = NULL; spin_unlock(&realm->inodes_with_caps_lock); ceph_put_snap_realm(mdsc, realm); } @@ -660,13 +669,15 @@ void ceph_fill_file_time(struct inode *inode, int issued, CEPH_CAP_FILE_BUFFER| CEPH_CAP_AUTH_EXCL| CEPH_CAP_XATTR_EXCL)) { - if (timespec_compare(ctime, &inode->i_ctime) > 0) { + if (ci->i_version == 0 || + timespec_compare(ctime, &inode->i_ctime) > 0) { dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, ctime->tv_sec, ctime->tv_nsec); inode->i_ctime = *ctime; } - if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { + if (ci->i_version == 0 || + ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { /* the MDS did a utimes() */ dout("mtime %ld.%09ld -> %ld.%09ld " "tw %d -> %d\n", @@ -786,10 +797,11 @@ static int fill_inode(struct inode *inode, struct page *locked_page, new_issued = ~issued & le32_to_cpu(info->cap.caps); /* update inode */ - ci->i_version = le64_to_cpu(info->version); inode->i_rdev = le32_to_cpu(info->rdev); inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(info->mode); @@ -857,6 +869,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, xattr_blob = NULL; } + /* finally update i_version */ + ci->i_version = le64_to_cpu(info->version); + inode->i_mapping->a_ops = &ceph_aops; switch (inode->i_mode & S_IFMT) { @@ -1867,20 +1882,9 @@ retry: * possibly truncate them.. so write AND block! 
*/ if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { - struct ceph_cap_snap *capsnap; - to = ci->i_truncate_size; - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - // MDS should have revoked Frw caps - WARN_ON_ONCE(capsnap->writing); - if (capsnap->dirty_pages && capsnap->size > to) - to = capsnap->size; - } spin_unlock(&ci->i_ceph_lock); dout("__do_pending_vmtruncate %p flushing snaps first\n", inode); - - truncate_pagecache(inode, to); - filemap_write_and_wait_range(&inode->i_data, 0, inode->i_sb->s_maxbytes); goto retry; @@ -2152,6 +2156,10 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (err != 0) return err; + if ((attr->ia_valid & ATTR_SIZE) && + ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size)) + return -EDQUOT; + err = __ceph_setattr(inode, attr); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 851aa69ec8f0..c90f03beb15d 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -5,7 +5,7 @@ #include "super.h" #include "mds_client.h" #include "ioctl.h" - +#include <linux/ceph/striper.h> /* * ioctls @@ -185,7 +185,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) &ceph_sb_to_client(inode->i_sb)->client->osdc; struct ceph_object_locator oloc; CEPH_DEFINE_OID_ONSTACK(oid); - u64 len = 1, olen; + u32 xlen; u64 tmp; struct ceph_pg pgid; int r; @@ -195,13 +195,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return -EFAULT; down_read(&osdc->lock); - r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, - &dl.object_no, &dl.object_offset, - &olen); - if (r < 0) { - up_read(&osdc->lock); - return -EIO; - } + ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, 1, + &dl.object_no, &dl.object_offset, &xlen); dl.file_offset -= dl.object_offset; dl.object_size = ci->i_layout.object_size; dl.block_size = ci->i_layout.stripe_unit; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 9e66f69ee8a5..9dae2ec7e1fa 
100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -95,7 +95,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, owner = secure_addr(fl->fl_owner); dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " - "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type, + "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type, (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, wait, fl->fl_type); @@ -132,7 +132,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, } ceph_mdsc_put_request(req); dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type, + "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type, (int)operation, (u64)fl->fl_pid, fl->fl_start, length, wait, fl->fl_type, err); return err; @@ -226,7 +226,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) return -ENOLCK; - dout("ceph_lock, fl_owner: %p", fl->fl_owner); + dout("ceph_lock, fl_owner: %p\n", fl->fl_owner); /* set wait bit as appropriate, then make command as Ceph expects it*/ if (IS_GETLK(cmd)) @@ -264,7 +264,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); if (!err) { if (op == CEPH_MDS_OP_SETFILELOCK) { - dout("mds locked, locking locally"); + dout("mds locked, locking locally\n"); err = posix_lock_file(file, fl, NULL); if (err) { /* undo! This should only happen if @@ -272,7 +272,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) * deadlock. 
*/ ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, CEPH_LOCK_UNLOCK, 0, fl); - dout("got %d on posix_lock_file, undid lock", + dout("got %d on posix_lock_file, undid lock\n", err); } } @@ -294,7 +294,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) if (fl->fl_type & LOCK_MAND) return -EOPNOTSUPP; - dout("ceph_flock, fl_file: %p", fl->fl_file); + dout("ceph_flock, fl_file: %p\n", fl->fl_file); spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { @@ -329,7 +329,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, inode, CEPH_LOCK_UNLOCK, 0, fl); - dout("got %d on locks_lock_file_wait, undid lock", err); + dout("got %d on locks_lock_file_wait, undid lock\n", err); } } return err; @@ -356,7 +356,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) ++(*flock_count); spin_unlock(&ctx->flc_lock); } - dout("counted %d flock locks and %d fcntl locks", + dout("counted %d flock locks and %d fcntl locks\n", *flock_count, *fcntl_count); } @@ -384,7 +384,7 @@ static int lock_to_ceph_filelock(struct file_lock *lock, cephlock->type = CEPH_LOCK_UNLOCK; break; default: - dout("Have unknown lock type %d", lock->fl_type); + dout("Have unknown lock type %d\n", lock->fl_type); err = -EINVAL; } @@ -407,7 +407,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, int seen_flock = 0; int l = 0; - dout("encoding %d flock and %d fcntl locks", num_flock_locks, + dout("encoding %d flock and %d fcntl locks\n", num_flock_locks, num_fcntl_locks); if (!ctx) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 2e8f90f96540..5ece2e6ad154 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -100,6 +100,26 @@ static int parse_reply_info_in(void **p, void *end, } else info->inline_version = CEPH_INLINE_NONE; + if (features & CEPH_FEATURE_MDS_QUOTA) { + u8 struct_v, struct_compat; + u32 struct_len; + + /* + * both struct_v and 
struct_compat are expected to be >= 1 + */ + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + if (!struct_v || !struct_compat) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + ceph_decode_64_safe(p, end, info->max_bytes, bad); + ceph_decode_64_safe(p, end, info->max_files, bad); + } else { + info->max_bytes = 0; + info->max_files = 0; + } + info->pool_ns_len = 0; info->pool_ns_data = NULL; if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { @@ -384,7 +404,7 @@ static struct ceph_mds_session *get_session(struct ceph_mds_session *s) refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref)); return s; } else { - dout("mdsc get_session %p 0 -- FAIL", s); + dout("mdsc get_session %p 0 -- FAIL\n", s); return NULL; } } @@ -419,9 +439,10 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, static bool __have_session(struct ceph_mds_client *mdsc, int mds) { - if (mds >= mdsc->max_sessions) + if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) return false; - return mdsc->sessions[mds]; + else + return true; } static int __verify_registered_session(struct ceph_mds_client *mdsc, @@ -448,6 +469,25 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) return ERR_PTR(-ENOMEM); + + if (mds >= mdsc->max_sessions) { + int newmax = 1 << get_count_order(mds + 1); + struct ceph_mds_session **sa; + + dout("%s: realloc to %d\n", __func__, newmax); + sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); + if (!sa) + goto fail_realloc; + if (mdsc->sessions) { + memcpy(sa, mdsc->sessions, + mdsc->max_sessions * sizeof(void *)); + kfree(mdsc->sessions); + } + mdsc->sessions = sa; + mdsc->max_sessions = newmax; + } + + dout("%s: mds%d\n", __func__, mds); s->s_mdsc = mdsc; s->s_mds = mds; s->s_state = CEPH_MDS_SESSION_NEW; @@ -476,23 +516,6 @@ static struct ceph_mds_session *register_session(struct 
ceph_mds_client *mdsc, INIT_LIST_HEAD(&s->s_cap_releases); INIT_LIST_HEAD(&s->s_cap_flushing); - dout("register_session mds%d\n", mds); - if (mds >= mdsc->max_sessions) { - int newmax = 1 << get_count_order(mds+1); - struct ceph_mds_session **sa; - - dout("register_session realloc to %d\n", newmax); - sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); - if (!sa) - goto fail_realloc; - if (mdsc->sessions) { - memcpy(sa, mdsc->sessions, - mdsc->max_sessions * sizeof(void *)); - kfree(mdsc->sessions); - } - mdsc->sessions = sa; - mdsc->max_sessions = newmax; - } mdsc->sessions[mds] = s; atomic_inc(&mdsc->num_sessions); refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */ @@ -2531,10 +2554,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) * Otherwise we just have to return an ESTALE */ if (result == -ESTALE) { - dout("got ESTALE on request %llu", req->r_tid); + dout("got ESTALE on request %llu\n", req->r_tid); req->r_resend_mds = -1; if (req->r_direct_mode != USE_AUTH_MDS) { - dout("not using auth, setting for that now"); + dout("not using auth, setting for that now\n"); req->r_direct_mode = USE_AUTH_MDS; __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); @@ -2542,13 +2565,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } else { int mds = __choose_mds(mdsc, req); if (mds >= 0 && mds != req->r_session->s_mds) { - dout("but auth changed, so resending"); + dout("but auth changed, so resending\n"); __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; } } - dout("have to return ESTALE on request %llu", req->r_tid); + dout("have to return ESTALE on request %llu\n", req->r_tid); } @@ -3470,13 +3493,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, } /* - * drop all leases (and dentry refs) in preparation for umount + * lock unlock sessions, to wait ongoing session activities */ -static void drop_leases(struct ceph_mds_client *mdsc) +static void 
lock_unlock_sessions(struct ceph_mds_client *mdsc) { int i; - dout("drop_leases\n"); mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); @@ -3572,7 +3594,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) if (!mdsc) return -ENOMEM; mdsc->fsc = fsc; - fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); if (!mdsc->mdsmap) { @@ -3580,6 +3601,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) return -ENOMEM; } + fsc->mdsc = mdsc; init_completion(&mdsc->safe_umount_waiters); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); @@ -3587,6 +3609,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) atomic_set(&mdsc->num_sessions, 0); mdsc->max_sessions = 0; mdsc->stopping = 0; + atomic64_set(&mdsc->quotarealms_count, 0); mdsc->last_snap_seq = 0; init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; @@ -3660,7 +3683,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) dout("pre_umount\n"); mdsc->stopping = 1; - drop_leases(mdsc); + lock_unlock_sessions(mdsc); ceph_flush_dirty_caps(mdsc); wait_requests(mdsc); @@ -3858,6 +3881,9 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc) struct ceph_mds_client *mdsc = fsc->mdsc; dout("mdsc_destroy %p\n", mdsc); + if (!mdsc) + return; + /* flush out any connection work with references to us */ ceph_msgr_flush(); @@ -4077,6 +4103,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) case CEPH_MSG_CLIENT_LEASE: handle_lease(mdsc, s, msg); break; + case CEPH_MSG_CLIENT_QUOTA: + ceph_handle_quota(mdsc, s, msg); + break; default: pr_err("received unknown message type %d %s\n", type, diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 71e3b783ee6f..2ec3b5b35067 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -49,6 +49,8 @@ struct ceph_mds_reply_info_in { char *inline_data; u32 pool_ns_len; char *pool_ns_data; + 
u64 max_bytes; + u64 max_files; }; struct ceph_mds_reply_dir_entry { @@ -312,6 +314,8 @@ struct ceph_mds_client { int max_sessions; /* len of s_mds_sessions */ int stopping; /* true if shutting down */ + atomic64_t quotarealms_count; /* # realms with quota */ + /* * snap_rwsem will cover cap linkage into snaprealms, and * realm snap contexts. (later, we can do per-realm snap diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c new file mode 100644 index 000000000000..242bfa5c0539 --- /dev/null +++ b/fs/ceph/quota.c @@ -0,0 +1,361 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * quota.c - CephFS quota + * + * Copyright (C) 2017-2018 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <linux/statfs.h> + +#include "super.h" +#include "mds_client.h" + +void ceph_adjust_quota_realms_count(struct inode *inode, bool inc) +{ + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + if (inc) + atomic64_inc(&mdsc->quotarealms_count); + else + atomic64_dec(&mdsc->quotarealms_count); +} + +static inline bool ceph_has_realms_with_quotas(struct inode *inode) +{ + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + return atomic64_read(&mdsc->quotarealms_count) > 0; +} + +void ceph_handle_quota(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg) +{ + struct super_block *sb = mdsc->fsc->sb; + struct ceph_mds_quota *h = msg->front.iov_base; + struct ceph_vino vino; + struct inode *inode; + struct ceph_inode_info *ci; + + if (msg->front.iov_len != sizeof(*h)) { + pr_err("%s corrupt message mds%d len %d\n", __func__, + session->s_mds, (int)msg->front.iov_len); + ceph_msg_dump(msg); + return; + } + + /* increment msg sequence number */ + mutex_lock(&session->s_mutex); + session->s_seq++; + mutex_unlock(&session->s_mutex); + + /* lookup inode */ + vino.ino = le64_to_cpu(h->ino); + vino.snap = CEPH_NOSNAP; + inode = ceph_find_inode(sb, vino); + if (!inode) { + pr_warn("Failed to find inode %llu\n", vino.ino); + return; + } + ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + ci->i_rbytes = le64_to_cpu(h->rbytes); + ci->i_rfiles = le64_to_cpu(h->rfiles); + ci->i_rsubdirs = le64_to_cpu(h->rsubdirs); + __ceph_update_quota(ci, le64_to_cpu(h->max_bytes), + le64_to_cpu(h->max_files)); + spin_unlock(&ci->i_ceph_lock); + + iput(inode); +} + +/* + * This function walks through the snaprealm for an inode and returns the + * ceph_snap_realm for the first snaprealm that has quotas set (either max_files + * or max_bytes). If the root is reached, return the root ceph_snap_realm + * instead. 
+ * + * Note that the caller is responsible for calling ceph_put_snap_realm() on the + * returned realm. + */ +static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, + struct inode *inode) +{ + struct ceph_inode_info *ci = NULL; + struct ceph_snap_realm *realm, *next; + struct inode *in; + bool has_quota; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return NULL; + + realm = ceph_inode(inode)->i_snap_realm; + if (realm) + ceph_get_snap_realm(mdsc, realm); + else + pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) " + "null i_snap_realm\n", ceph_vinop(inode)); + while (realm) { + spin_lock(&realm->inodes_with_caps_lock); + in = realm->inode ? igrab(realm->inode) : NULL; + spin_unlock(&realm->inodes_with_caps_lock); + if (!in) + break; + + ci = ceph_inode(in); + has_quota = __ceph_has_any_quota(ci); + iput(in); + + next = realm->parent; + if (has_quota || !next) + return realm; + + ceph_get_snap_realm(mdsc, next); + ceph_put_snap_realm(mdsc, realm); + realm = next; + } + if (realm) + ceph_put_snap_realm(mdsc, realm); + + return NULL; +} + +bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) +{ + struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc; + struct ceph_snap_realm *old_realm, *new_realm; + bool is_same; + + down_read(&mdsc->snap_rwsem); + old_realm = get_quota_realm(mdsc, old); + new_realm = get_quota_realm(mdsc, new); + is_same = (old_realm == new_realm); + up_read(&mdsc->snap_rwsem); + + if (old_realm) + ceph_put_snap_realm(mdsc, old_realm); + if (new_realm) + ceph_put_snap_realm(mdsc, new_realm); + + return is_same; +} + +enum quota_check_op { + QUOTA_CHECK_MAX_FILES_OP, /* check quota max_files limit */ + QUOTA_CHECK_MAX_BYTES_OP, /* check quota max_bytes limit */ + QUOTA_CHECK_MAX_BYTES_APPROACHING_OP /* check if quota max_bytes + limit is approaching */ +}; + +/* + * check_quota_exceeded() will walk up the snaprealm hierarchy and, for each + * realm, it will execute quota check operation defined by the
'op' parameter. + * The snaprealm walk is interrupted if the quota check detects that the quota + * is exceeded or if the root inode is reached. + */ +static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, + loff_t delta) +{ + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_inode_info *ci; + struct ceph_snap_realm *realm, *next; + struct inode *in; + u64 max, rvalue; + bool exceeded = false; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return false; + + down_read(&mdsc->snap_rwsem); + realm = ceph_inode(inode)->i_snap_realm; + if (realm) + ceph_get_snap_realm(mdsc, realm); + else + pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) " + "null i_snap_realm\n", ceph_vinop(inode)); + while (realm) { + spin_lock(&realm->inodes_with_caps_lock); + in = realm->inode ? igrab(realm->inode) : NULL; + spin_unlock(&realm->inodes_with_caps_lock); + if (!in) + break; + + ci = ceph_inode(in); + spin_lock(&ci->i_ceph_lock); + if (op == QUOTA_CHECK_MAX_FILES_OP) { + max = ci->i_max_files; + rvalue = ci->i_rfiles + ci->i_rsubdirs; + } else { + max = ci->i_max_bytes; + rvalue = ci->i_rbytes; + } + spin_unlock(&ci->i_ceph_lock); + switch (op) { + case QUOTA_CHECK_MAX_FILES_OP: + exceeded = (max && (rvalue >= max)); + break; + case QUOTA_CHECK_MAX_BYTES_OP: + exceeded = (max && (rvalue + delta > max)); + break; + case QUOTA_CHECK_MAX_BYTES_APPROACHING_OP: + if (max) { + if (rvalue >= max) + exceeded = true; + else { + /* + * when we're writing more than 1/16th + * of the available space + */ + exceeded = + (((max - rvalue) >> 4) < delta); + } + } + break; + default: + /* Shouldn't happen */ + pr_warn("Invalid quota check op (%d)\n", op); + exceeded = true; /* Just break the loop */ + } + iput(in); + + next = realm->parent; + if (exceeded || !next) + break; + ceph_get_snap_realm(mdsc, next); + ceph_put_snap_realm(mdsc, realm); + realm = next; + } + ceph_put_snap_realm(mdsc, realm); + up_read(&mdsc->snap_rwsem); + + return
exceeded; +} + +/* + * ceph_quota_is_max_files_exceeded - check if we can create a new file + * @inode: directory where a new file is being created + * + * This function returns true if the max_files quota does not allow a new + * file to be created. It is necessary to walk through the snaprealm hierarchy + * (until the FS root) to check all realms with quotas set. + */ +bool ceph_quota_is_max_files_exceeded(struct inode *inode) +{ + if (!ceph_has_realms_with_quotas(inode)) + return false; + + WARN_ON(!S_ISDIR(inode->i_mode)); + + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 0); +} + +/* + * ceph_quota_is_max_bytes_exceeded - check if we can write to a file + * @inode: inode being written + * @newsize: new size if write succeeds + * + * This function returns true if the max_bytes quota does not allow the file + * size to reach @newsize; it returns false otherwise. + */ +bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newsize) +{ + loff_t size = i_size_read(inode); + + if (!ceph_has_realms_with_quotas(inode)) + return false; + + /* return immediately if we're decreasing file size */ + if (newsize <= size) + return false; + + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_OP, (newsize - size)); +} + +/* + * ceph_quota_is_max_bytes_approaching - check if we're reaching max_bytes + * @inode: inode being written + * @newsize: new size if write succeeds + * + * This function returns true if the new file size @newsize will be consuming + * more than 1/16th of the available quota space; it returns false otherwise.
+ */ +bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newsize) +{ + loff_t size = ceph_inode(inode)->i_reported_size; + + if (!ceph_has_realms_with_quotas(inode)) + return false; + + /* return immediately if we're decreasing file size */ + if (newsize <= size) + return false; + + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_APPROACHING_OP, + (newsize - size)); +} + +/* + * ceph_quota_update_statfs - if root has quota update statfs with quota status + * @fsc: filesystem client instance + * @buf: statfs to update + * + * If the mounted filesystem root has max_bytes quota set, update the filesystem + * statistics with the quota status. + * + * This function returns true if the stats have been updated, false otherwise. + */ +bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_inode_info *ci; + struct ceph_snap_realm *realm; + struct inode *in; + u64 total = 0, used, free; + bool is_updated = false; + + down_read(&mdsc->snap_rwsem); + realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root)); + up_read(&mdsc->snap_rwsem); + if (!realm) + return false; + + spin_lock(&realm->inodes_with_caps_lock); + in = realm->inode ? igrab(realm->inode) : NULL; + spin_unlock(&realm->inodes_with_caps_lock); + if (in) { + ci = ceph_inode(in); + spin_lock(&ci->i_ceph_lock); + if (ci->i_max_bytes) { + total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT; + used = ci->i_rbytes >> CEPH_BLOCK_SHIFT; + /* It is possible for a quota to be exceeded. + * Report 'zero' in that case + */ + free = total > used ? 
total - used : 0; + } + spin_unlock(&ci->i_ceph_lock); + if (total) { + buf->f_blocks = total; + buf->f_bfree = free; + buf->f_bavail = free; + is_updated = true; + } + iput(in); + } + ceph_put_snap_realm(mdsc, realm); + + return is_updated; +} + diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 07cf95e6413d..041c27ea8de1 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -931,6 +931,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps); ci->i_snap_realm = realm; + if (realm->ino == ci->i_vino.ino) + realm->inode = inode; spin_unlock(&realm->inodes_with_caps_lock); spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index fb2bc9c15a23..b33082e6878f 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -76,9 +76,18 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) */ buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; - buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); - buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); - buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); + + /* + * By default use root quota for stats; fallback to overall filesystem + * usage if using 'noquotadf' mount option or if the root dir doesn't + * have max_bytes quota set. 
+ */ + if (ceph_test_mount_opt(fsc, NOQUOTADF) || + !ceph_quota_update_statfs(fsc, buf)) { + buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); + buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); + buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); + } buf->f_files = le64_to_cpu(st.num_objects); buf->f_ffree = -1; @@ -151,6 +160,8 @@ enum { Opt_acl, #endif Opt_noacl, + Opt_quotadf, + Opt_noquotadf, }; static match_table_t fsopt_tokens = { @@ -187,6 +198,8 @@ static match_table_t fsopt_tokens = { {Opt_acl, "acl"}, #endif {Opt_noacl, "noacl"}, + {Opt_quotadf, "quotadf"}, + {Opt_noquotadf, "noquotadf"}, {-1, NULL} }; @@ -314,13 +327,16 @@ static int parse_fsopt_token(char *c, void *private) break; case Opt_fscache: fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + kfree(fsopt->fscache_uniq); + fsopt->fscache_uniq = NULL; break; case Opt_nofscache: fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; + kfree(fsopt->fscache_uniq); + fsopt->fscache_uniq = NULL; break; case Opt_poolperm: fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; - printk ("pool perm"); break; case Opt_nopoolperm: fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; @@ -331,6 +347,12 @@ static int parse_fsopt_token(char *c, void *private) case Opt_norequire_active_mds: fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; break; + case Opt_quotadf: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF; + break; + case Opt_noquotadf: + fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; + break; #ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: fsopt->sb_flags |= SB_POSIXACL; @@ -513,13 +535,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) seq_puts(m, ",nodcache"); if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) { - if (fsopt->fscache_uniq) - seq_printf(m, ",fsc=%s", fsopt->fscache_uniq); - else - seq_puts(m, ",fsc"); + seq_show_option(m, "fsc", fsopt->fscache_uniq); } if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM) seq_puts(m, ",nopoolperm"); + if 
(fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF) + seq_puts(m, ",noquotadf"); #ifdef CONFIG_CEPH_FS_POSIX_ACL if (fsopt->sb_flags & SB_POSIXACL) @@ -529,7 +550,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) #endif if (fsopt->mds_namespace) - seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace); + seq_show_option(m, "mds_namespace", fsopt->mds_namespace); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) @@ -679,6 +700,7 @@ struct kmem_cache *ceph_cap_cachep; struct kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; +struct kmem_cache *ceph_dir_file_cachep; static void ceph_inode_init_once(void *foo) { @@ -698,8 +720,7 @@ static int __init init_caches(void) if (!ceph_inode_cachep) return -ENOMEM; - ceph_cap_cachep = KMEM_CACHE(ceph_cap, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD); if (!ceph_cap_cachep) goto bad_cap; ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, @@ -716,6 +737,10 @@ static int __init init_caches(void) if (!ceph_file_cachep) goto bad_file; + ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, SLAB_MEM_SPREAD); + if (!ceph_dir_file_cachep) + goto bad_dir_file; + error = ceph_fscache_register(); if (error) goto bad_fscache; @@ -723,6 +748,8 @@ static int __init init_caches(void) return 0; bad_fscache: + kmem_cache_destroy(ceph_dir_file_cachep); +bad_dir_file: kmem_cache_destroy(ceph_file_cachep); bad_file: kmem_cache_destroy(ceph_dentry_cachep); @@ -748,6 +775,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_cap_flush_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); + kmem_cache_destroy(ceph_dir_file_cachep); ceph_fscache_unregister(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1c2086e0fec2..a7077a0c989f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -39,6 +39,7 @@ #define 
CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ +#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ #define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE @@ -310,6 +311,9 @@ struct ceph_inode_info { u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; + /* quotas */ + u64 i_max_bytes, i_max_files; + struct rb_root i_fragtree; int i_fragtree_nsplits; struct mutex i_fragtree_mutex; @@ -671,6 +675,10 @@ struct ceph_file_info { spinlock_t rw_contexts_lock; struct list_head rw_contexts; +}; + +struct ceph_dir_file_info { + struct ceph_file_info file_info; /* readdir: position within the dir */ u32 frag; @@ -748,6 +756,7 @@ struct ceph_readdir_cache_control { */ struct ceph_snap_realm { u64 ino; + struct inode *inode; atomic_t nref; struct rb_node node; @@ -1066,4 +1075,37 @@ extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); +/* quota.c */ +static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci) +{ + return ci->i_max_files || ci->i_max_bytes; +} + +extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc); + +static inline void __ceph_update_quota(struct ceph_inode_info *ci, + u64 max_bytes, u64 max_files) +{ + bool had_quota, has_quota; + had_quota = __ceph_has_any_quota(ci); + ci->i_max_bytes = max_bytes; + ci->i_max_files = max_files; + has_quota = __ceph_has_any_quota(ci); + + if (had_quota != has_quota) + ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota); +} + +extern void ceph_handle_quota(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_msg *msg); +extern bool ceph_quota_is_max_files_exceeded(struct inode *inode); +extern bool ceph_quota_is_same_realm(struct inode *old, struct 
inode *new); +extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, + loff_t newlen); +extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode, + loff_t newlen); +extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, + struct kstatfs *buf); + #endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index e1c4e0b12b4c..7e72348639e4 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -224,6 +224,31 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, (long)ci->i_rctime.tv_nsec); } +/* quotas */ + +static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci) +{ + return (ci->i_max_files || ci->i_max_bytes); +} + +static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "max_bytes=%llu max_files=%llu", + ci->i_max_bytes, ci->i_max_files); +} + +static size_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%llu", ci->i_max_bytes); +} + +static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%llu", ci->i_max_files); +} #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." 
#_name #define CEPH_XATTR_NAME2(_type, _name, _name2) \ @@ -247,6 +272,15 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, .hidden = true, \ .exists_cb = ceph_vxattrcb_layout_exists, \ } +#define XATTR_QUOTA_FIELD(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof(CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .readonly = false, \ + .hidden = true, \ + .exists_cb = ceph_vxattrcb_quota_exists, \ + } static struct ceph_vxattr ceph_dir_vxattrs[] = { { @@ -270,6 +304,16 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_NAME_CEPH(dir, rsubdirs), XATTR_NAME_CEPH(dir, rbytes), XATTR_NAME_CEPH(dir, rctime), + { + .name = "ceph.quota", + .name_size = sizeof("ceph.quota"), + .getxattr_cb = ceph_vxattrcb_quota, + .readonly = false, + .hidden = true, + .exists_cb = ceph_vxattrcb_quota_exists, + }, + XATTR_QUOTA_FIELD(quota, max_bytes), + XATTR_QUOTA_FIELD(quota, max_files), { .name = NULL, 0 } /* Required table terminator */ }; static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 2c14020e5e1d..edf5f40898bf 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -46,67 +46,11 @@ void cifs_fscache_unregister(void) } /* - * Key layout of CIFS server cache index object - */ -struct cifs_server_key { - uint16_t family; /* address family */ - __be16 port; /* IP port */ - union { - struct in_addr ipv4_addr; - struct in6_addr ipv6_addr; - } addr[0]; -}; - -/* - * Server object keyed by {IPaddress,port,family} tuple - */ -static uint16_t cifs_server_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t maxbuf) -{ - const struct TCP_Server_Info *server = cookie_netfs_data; - const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr; - const struct sockaddr_in *addr = (struct sockaddr_in *) sa; - const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa; - struct 
cifs_server_key *key = buffer; - uint16_t key_len = sizeof(struct cifs_server_key); - - memset(key, 0, key_len); - - /* - * Should not be a problem as sin_family/sin6_family overlays - * sa_family field - */ - switch (sa->sa_family) { - case AF_INET: - key->family = sa->sa_family; - key->port = addr->sin_port; - key->addr[0].ipv4_addr = addr->sin_addr; - key_len += sizeof(key->addr[0].ipv4_addr); - break; - - case AF_INET6: - key->family = sa->sa_family; - key->port = addr6->sin6_port; - key->addr[0].ipv6_addr = addr6->sin6_addr; - key_len += sizeof(key->addr[0].ipv6_addr); - break; - - default: - cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family); - key_len = 0; - break; - } - - return key_len; -} - -/* * Server object for FS-Cache */ const struct fscache_cookie_def cifs_fscache_server_index_def = { .name = "CIFS.server", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = cifs_server_get_key, }; /* @@ -116,7 +60,7 @@ struct cifs_fscache_super_auxdata { u64 resource_id; /* unique server resource id */ }; -static char *extract_sharename(const char *treename) +char *extract_sharename(const char *treename) { const char *src; char *delim, *dst; @@ -140,56 +84,11 @@ static char *extract_sharename(const char *treename) return dst; } -/* - * Superblock object currently keyed by share name - */ -static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer, - uint16_t maxbuf) -{ - const struct cifs_tcon *tcon = cookie_netfs_data; - char *sharename; - uint16_t len; - - sharename = extract_sharename(tcon->treeName); - if (IS_ERR(sharename)) { - cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); - sharename = NULL; - return 0; - } - - len = strlen(sharename); - if (len > maxbuf) - return 0; - - memcpy(buffer, sharename, len); - - kfree(sharename); - - return len; -} - -static uint16_t -cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer, - uint16_t maxbuf) -{ - struct cifs_fscache_super_auxdata auxdata; - const struct 
cifs_tcon *tcon = cookie_netfs_data; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.resource_id = tcon->resource_id; - - if (maxbuf > sizeof(auxdata)) - maxbuf = sizeof(auxdata); - - memcpy(buffer, &auxdata, maxbuf); - - return maxbuf; -} - static enum fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data, const void *data, - uint16_t datalen) + uint16_t datalen, + loff_t object_size) { struct cifs_fscache_super_auxdata auxdata; const struct cifs_tcon *tcon = cookie_netfs_data; @@ -212,68 +111,14 @@ fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data, const struct fscache_cookie_def cifs_fscache_super_index_def = { .name = "CIFS.super", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = cifs_super_get_key, - .get_aux = cifs_fscache_super_get_aux, .check_aux = cifs_fscache_super_check_aux, }; -/* - * Auxiliary data attached to CIFS inode within the cache - */ -struct cifs_fscache_inode_auxdata { - struct timespec last_write_time; - struct timespec last_change_time; - u64 eof; -}; - -static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t maxbuf) -{ - const struct cifsInodeInfo *cifsi = cookie_netfs_data; - uint16_t keylen; - - /* use the UniqueId as the key */ - keylen = sizeof(cifsi->uniqueid); - if (keylen > maxbuf) - keylen = 0; - else - memcpy(buffer, &cifsi->uniqueid, keylen); - - return keylen; -} - -static void -cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) -{ - const struct cifsInodeInfo *cifsi = cookie_netfs_data; - - *size = cifsi->vfs_inode.i_size; -} - -static uint16_t -cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer, - uint16_t maxbuf) -{ - struct cifs_fscache_inode_auxdata auxdata; - const struct cifsInodeInfo *cifsi = cookie_netfs_data; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time = cifsi->vfs_inode.i_mtime; - auxdata.last_change_time = cifsi->vfs_inode.i_ctime; - - 
if (maxbuf > sizeof(auxdata)) - maxbuf = sizeof(auxdata); - - memcpy(buffer, &auxdata, maxbuf); - - return maxbuf; -} - static enum fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, const void *data, - uint16_t datalen) + uint16_t datalen, + loff_t object_size) { struct cifs_fscache_inode_auxdata auxdata; struct cifsInodeInfo *cifsi = cookie_netfs_data; @@ -295,8 +140,5 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, const struct fscache_cookie_def cifs_fscache_inode_object_def = { .name = "CIFS.uniqueid", .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .get_key = cifs_fscache_inode_get_key, - .get_attr = cifs_fscache_inode_get_attr, - .get_aux = cifs_fscache_inode_get_aux, .check_aux = cifs_fscache_inode_check_aux, }; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index e35e711db68e..9d69ea433330 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -42,23 +42,6 @@ cifs_dump_mem(char *label, void *data, int length) data, length, true); } -#ifdef CONFIG_CIFS_DEBUG -void cifs_vfs_err(const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - pr_err_ratelimited("CIFS VFS: %pV", &vaf); - - va_end(args); -} -#endif - void cifs_dump_detail(void *buf) { #ifdef CONFIG_CIFS_DEBUG2 diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index c611ca2339d7..fe5567655662 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -39,6 +39,7 @@ extern int cifsFYI; #else #define NOISY 0 #endif +#define ONCE 8 /* * debug ON @@ -46,19 +47,28 @@ extern int cifsFYI; */ #ifdef CONFIG_CIFS_DEBUG -__printf(1, 2) void cifs_vfs_err(const char *fmt, ...); - /* information message: e.g., configuration, major event */ -#define cifs_dbg(type, fmt, ...) 
\ -do { \ - if (type == FYI && cifsFYI & CIFS_INFO) { \ - pr_debug_ratelimited("%s: " \ - fmt, __FILE__, ##__VA_ARGS__); \ - } else if (type == VFS) { \ - cifs_vfs_err(fmt, ##__VA_ARGS__); \ - } else if (type == NOISY && type != 0) { \ - pr_debug_ratelimited(fmt, ##__VA_ARGS__); \ - } \ +#define cifs_dbg_func(ratefunc, type, fmt, ...) \ +do { \ + if ((type) & FYI && cifsFYI & CIFS_INFO) { \ + pr_debug_ ## ratefunc("%s: " \ + fmt, __FILE__, ##__VA_ARGS__); \ + } else if ((type) & VFS) { \ + pr_err_ ## ratefunc("CIFS VFS: " \ + fmt, ##__VA_ARGS__); \ + } else if ((type) & NOISY && (NOISY != 0)) { \ + pr_debug_ ## ratefunc(fmt, ##__VA_ARGS__); \ + } \ +} while (0) + +#define cifs_dbg(type, fmt, ...) \ +do { \ + if ((type) & ONCE) \ + cifs_dbg_func(once, \ + type, fmt, ##__VA_ARGS__); \ + else \ + cifs_dbg_func(ratelimited, \ + type, fmt, ##__VA_ARGS__); \ } while (0) /* diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 2282562e78a1..cb950a5fa078 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -665,6 +665,8 @@ struct TCP_Server_Info { struct delayed_work echo; /* echo ping workqueue job */ char *smallbuf; /* pointer to current "small" buffer */ char *bigbuf; /* pointer to current "big" buffer */ + /* Total size of this PDU.
Only valid from cifs_demultiplex_thread */ + unsigned int pdu_size; unsigned int total_read; /* total amount of data read in this pass */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ @@ -676,6 +678,7 @@ struct TCP_Server_Info { unsigned int max_read; unsigned int max_write; #ifdef CONFIG_CIFS_SMB311 + __le16 cipher_type; /* save initital negprot hash */ __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; #endif /* 3.1.1 */ @@ -1373,6 +1376,7 @@ struct mid_q_entry { mid_handle_t *handle; /* call handle mid callback */ void *callback_data; /* general purpose pointer for callback */ void *resp_buf; /* pointer to received SMB header */ + unsigned int resp_buf_size; int mid_state; /* wish this were enum but can not pass to wait_event */ unsigned int mid_flags; __le16 command; /* smb command code */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 59c09a596c0a..6d3e40d7029c 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -206,8 +206,10 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) mutex_unlock(&ses->session_mutex); cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); - if (rc) + if (rc) { + printk_once(KERN_WARNING "reconnect tcon failed rc = %d\n", rc); goto out; + } atomic_inc(&tconInfoReconnectCount); @@ -1416,8 +1418,9 @@ openRetry: int cifs_discard_remaining_data(struct TCP_Server_Info *server) { - unsigned int rfclen = get_rfc1002_length(server->smallbuf); - int remaining = rfclen + 4 - server->total_read; + unsigned int rfclen = server->pdu_size; + int remaining = rfclen + server->vals->header_preamble_size - + server->total_read; while (remaining > 0) { int length; @@ -1454,7 +1457,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) unsigned int data_offset, data_len; struct cifs_readdata *rdata = mid->callback_data; char *buf = server->smallbuf; - unsigned int buflen = get_rfc1002_length(buf) + + unsigned int buflen = server->pdu_size + 
server->vals->header_preamble_size; bool use_rdma_mr = false; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4e0808f40195..e8830f076a7f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -772,7 +772,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) { int length; char *buf = server->smallbuf; - unsigned int pdu_length = get_rfc1002_length(buf); + unsigned int pdu_length = server->pdu_size; /* make sure this will fit in a large buffer */ if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - @@ -881,6 +881,7 @@ cifs_demultiplex_thread(void *p) * so we can now interpret the length field. */ pdu_length = get_rfc1002_length(buf); + server->pdu_size = pdu_length; cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length); if (!is_smb_response(server, buf[0])) @@ -927,6 +928,7 @@ cifs_demultiplex_thread(void *p) server->lstrp = jiffies; if (mid_entry != NULL) { + mid_entry->resp_buf_size = server->pdu_size; if ((mid_entry->mid_flags & MID_WAIT_CANCELLED) && mid_entry->mid_state == MID_RESPONSE_RECEIVED && server->ops->handle_cancelled_mid) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7cee97b93a61..4bcd4e838b47 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1987,11 +1987,10 @@ wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, for (i = 0; i < found_pages; i++) { page = wdata->pages[i]; /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping */ if (nr_pages == 0) diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 8d4b7bc8ae91..25d3f66b2d50 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c 
@@ -23,11 +23,63 @@ #include "cifs_debug.h" #include "cifs_fs_sb.h" +/* + * Key layout of CIFS server cache index object + */ +struct cifs_server_key { + struct { + uint16_t family; /* address family */ + __be16 port; /* IP port */ + } hdr; + union { + struct in_addr ipv4_addr; + struct in6_addr ipv6_addr; + }; +} __packed; + +/* + * Get a cookie for a server object keyed by {IPaddress,port,family} tuple + */ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) { + const struct sockaddr *sa = (struct sockaddr *) &server->dstaddr; + const struct sockaddr_in *addr = (struct sockaddr_in *) sa; + const struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) sa; + struct cifs_server_key key; + uint16_t key_len = sizeof(key.hdr); + + memset(&key, 0, sizeof(key)); + + /* + * Should not be a problem as sin_family/sin6_family overlays + * sa_family field + */ + key.hdr.family = sa->sa_family; + switch (sa->sa_family) { + case AF_INET: + key.hdr.port = addr->sin_port; + key.ipv4_addr = addr->sin_addr; + key_len += sizeof(key.ipv4_addr); + break; + + case AF_INET6: + key.hdr.port = addr6->sin6_port; + key.ipv6_addr = addr6->sin6_addr; + key_len += sizeof(key.ipv6_addr); + break; + + default: + cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family); + server->fscache = NULL; + return; + } + server->fscache = fscache_acquire_cookie(cifs_fscache_netfs.primary_index, - &cifs_fscache_server_index_def, server, true); + &cifs_fscache_server_index_def, + &key, key_len, + NULL, 0, + server, 0, true); cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, server, server->fscache); } @@ -36,17 +88,29 @@ void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) { cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, server, server->fscache); - fscache_relinquish_cookie(server->fscache, 0); + fscache_relinquish_cookie(server->fscache, NULL, false); server->fscache = NULL; } void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) { struct TCP_Server_Info *server = 
tcon->ses->server; + char *sharename; + + sharename = extract_sharename(tcon->treeName); + if (IS_ERR(sharename)) { + cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); + tcon->fscache = NULL; + return; + } tcon->fscache = fscache_acquire_cookie(server->fscache, - &cifs_fscache_super_index_def, tcon, true); + &cifs_fscache_super_index_def, + sharename, strlen(sharename), + &tcon->resource_id, sizeof(tcon->resource_id), + tcon, 0, true); + kfree(sharename); cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, server->fscache, tcon->fscache); } @@ -54,10 +118,28 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) { cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache); - fscache_relinquish_cookie(tcon->fscache, 0); + fscache_relinquish_cookie(tcon->fscache, &tcon->resource_id, false); tcon->fscache = NULL; } +static void cifs_fscache_acquire_inode_cookie(struct cifsInodeInfo *cifsi, + struct cifs_tcon *tcon) +{ + struct cifs_fscache_inode_auxdata auxdata; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + cifsi->fscache = + fscache_acquire_cookie(tcon->fscache, + &cifs_fscache_inode_object_def, + &cifsi->uniqueid, sizeof(cifsi->uniqueid), + &auxdata, sizeof(auxdata), + cifsi, cifsi->vfs_inode.i_size, true); +} + static void cifs_fscache_enable_inode_cookie(struct inode *inode) { struct cifsInodeInfo *cifsi = CIFS_I(inode); @@ -67,21 +149,28 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode) if (cifsi->fscache) return; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { - cifsi->fscache = fscache_acquire_cookie(tcon->fscache, - &cifs_fscache_inode_object_def, cifsi, true); - cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n", - __func__, tcon->fscache, cifsi->fscache); - } + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)) + return; + + 
cifs_fscache_acquire_inode_cookie(cifsi, tcon); + + cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n", + __func__, tcon->fscache, cifsi->fscache); } void cifs_fscache_release_inode_cookie(struct inode *inode) { + struct cifs_fscache_inode_auxdata auxdata; struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); - fscache_relinquish_cookie(cifsi->fscache, 0); + fscache_relinquish_cookie(cifsi->fscache, &auxdata, false); cifsi->fscache = NULL; } } @@ -93,7 +182,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode) if (cifsi->fscache) { cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); fscache_uncache_all_inode_pages(cifsi->fscache, inode); - fscache_relinquish_cookie(cifsi->fscache, 1); + fscache_relinquish_cookie(cifsi->fscache, NULL, true); cifsi->fscache = NULL; } } @@ -110,16 +199,14 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode) { struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct fscache_cookie *old = cifsi->fscache; if (cifsi->fscache) { /* retire the current fscache cache and get a new one */ - fscache_relinquish_cookie(cifsi->fscache, 1); + fscache_relinquish_cookie(cifsi->fscache, NULL, true); - cifsi->fscache = fscache_acquire_cookie( - cifs_sb_master_tcon(cifs_sb)->fscache, - &cifs_fscache_inode_object_def, - cifsi, true); + cifs_fscache_acquire_inode_cookie(cifsi, tcon); cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n", __func__, cifsi->fscache, old); } @@ -214,13 +301,15 @@ int __cifs_readpages_from_fscache(struct inode *inode, void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) { + struct cifsInodeInfo *cifsi = CIFS_I(inode); int ret; 
cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", - __func__, CIFS_I(inode)->fscache, page, inode); - ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL); + __func__, cifsi->fscache, page, inode); + ret = fscache_write_page(cifsi->fscache, page, + cifsi->vfs_inode.i_size, GFP_KERNEL); if (ret != 0) - fscache_uncache_page(CIFS_I(inode)->fscache, page); + fscache_uncache_page(cifsi->fscache, page); } void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) @@ -239,4 +328,3 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) fscache_wait_on_page_write(cookie, page); fscache_uncache_page(cookie, page); } - diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 24794b6cd8ec..c7e3ac251e16 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -27,6 +27,18 @@ #ifdef CONFIG_CIFS_FSCACHE +/* + * Auxiliary data attached to CIFS inode within the cache + */ +struct cifs_fscache_inode_auxdata { + struct timespec last_write_time; + struct timespec last_change_time; + u64 eof; +}; + +/* + * cache.c + */ extern struct fscache_netfs cifs_fscache_netfs; extern const struct fscache_cookie_def cifs_fscache_server_index_def; extern const struct fscache_cookie_def cifs_fscache_super_index_def; @@ -34,6 +46,7 @@ extern const struct fscache_cookie_def cifs_fscache_inode_object_def; extern int cifs_fscache_register(void); extern void cifs_fscache_unregister(void); +extern char *extract_sharename(const char *); /* * fscache.c diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f856df4adae3..3c371f7f5963 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -710,7 +710,7 @@ cgfi_exit: /* Simple function to return a 64 bit hash of string. 
Rarely called */ static __u64 simple_hashstr(const char *str) { - const __u64 hash_mult = 1125899906842597L; /* a big enough prime */ + const __u64 hash_mult = 1125899906842597ULL; /* a big enough prime */ __u64 hash = 0; while (*str) diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 5406e95f5d92..68ea8491c160 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -93,6 +93,43 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24) }; +#ifdef CONFIG_CIFS_SMB311 +static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen, + size_t hdr_preamble_size) +{ + __u16 neg_count; + __u32 nc_offset, size_of_pad_before_neg_ctxts; + struct smb2_negotiate_rsp *pneg_rsp = (struct smb2_negotiate_rsp *)hdr; + + /* Negotiate contexts are only valid for latest dialect SMB3.11 */ + neg_count = le16_to_cpu(pneg_rsp->NegotiateContextCount); + if ((neg_count == 0) || + (pneg_rsp->DialectRevision != cpu_to_le16(SMB311_PROT_ID))) + return 0; + + /* Make sure that negotiate contexts start after gss security blob */ + nc_offset = le32_to_cpu(pneg_rsp->NegotiateContextOffset); + if (nc_offset < non_ctxlen - hdr_preamble_size /* RFC1001 len */) { + printk_once(KERN_WARNING "invalid negotiate context offset\n"); + return 0; + } + size_of_pad_before_neg_ctxts = nc_offset - + (non_ctxlen - hdr_preamble_size); + + /* Verify that at least minimal negotiate contexts fit within frame */ + if (len < nc_offset + (neg_count * sizeof(struct smb2_neg_context))) { + printk_once(KERN_WARNING "negotiate context goes beyond end\n"); + return 0; + } + + cifs_dbg(FYI, "length of negcontexts %d pad %d\n", + len - nc_offset, size_of_pad_before_neg_ctxts); + + /* length of negcontexts including pad from end of sec blob to them */ + return (len - nc_offset) + size_of_pad_before_neg_ctxts; +} +#endif /* CIFS_SMB311 */ + int smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) { @@ -198,6 +235,11 
@@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) clc_len = smb2_calc_size(hdr); +#ifdef CONFIG_CIFS_SMB311 + if (shdr->Command == SMB2_NEGOTIATE) + clc_len += get_neg_ctxt_len(hdr, len, clc_len, + srvr->vals->header_preamble_size); +#endif /* SMB311 */ if (srvr->vals->header_preamble_size + len != clc_len) { cifs_dbg(FYI, "Calculated size %u length %zu mismatch mid %llu\n", clc_len, srvr->vals->header_preamble_size + len, mid); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 968b1d43a1ea..b4ae932ea134 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1451,6 +1451,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; + struct kvec err_iov = {NULL, 0}; struct smb2_err_rsp *err_buf = NULL; struct smb2_symlink_err_rsp *symlink; unsigned int sub_len; @@ -1473,15 +1474,16 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_buf); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov); if (!rc || !err_buf) { kfree(utf16_path); return -ENOENT; } + err_buf = err_iov.iov_base; if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) || - get_rfc1002_length(err_buf) + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE) { + err_iov.iov_len + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE) { kfree(utf16_path); return -ENOENT; } @@ -1494,13 +1496,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, print_len = le16_to_cpu(symlink->PrintNameLength); print_offset = le16_to_cpu(symlink->PrintNameOffset); - if (get_rfc1002_length(err_buf) + server->vals->header_preamble_size < + if (err_iov.iov_len + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { kfree(utf16_path); return -ENOENT; } - if 
(get_rfc1002_length(err_buf) + server->vals->header_preamble_size < + if (err_iov.iov_len + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { kfree(utf16_path); return -ENOENT; @@ -2550,7 +2552,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) unsigned int npages; struct page **pages; unsigned int len; - unsigned int buflen = get_rfc1002_length(buf) + server->vals->header_preamble_size; + unsigned int buflen = server->pdu_size + server->vals->header_preamble_size; int rc; int i = 0; @@ -2624,7 +2626,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, { int length; char *buf = server->smallbuf; - unsigned int pdu_length = get_rfc1002_length(buf); + unsigned int pdu_length = server->pdu_size; unsigned int buf_size; struct mid_q_entry *mid_entry; @@ -2668,7 +2670,7 @@ static int smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid) { char *buf = server->smallbuf; - unsigned int pdu_length = get_rfc1002_length(buf); + unsigned int pdu_length = server->pdu_size; struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize); @@ -2699,7 +2701,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) { char *buf = server->large_buf ? server->bigbuf : server->smallbuf; - return handle_read_data(server, mid, buf, get_rfc1002_length(buf) + + return handle_read_data(server, mid, buf, server->pdu_size + server->vals->header_preamble_size, NULL, 0, 0); } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index f7741cee2a4c..0f044c4a2dc9 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -268,8 +268,11 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) mutex_unlock(&tcon->ses->session_mutex); cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); - if (rc) + if (rc) { + /* If sess reconnected but tcon didn't, something strange ... 
*/ + printk_once(KERN_WARNING "reconnect tcon failed rc = %d\n", rc); goto out; + } if (smb2_command != SMB2_INTERNAL_CMD) queue_delayed_work(cifsiod_wq, &server->reconnect, 0); @@ -403,6 +406,100 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, *total_len += 4 + sizeof(struct smb2_preauth_neg_context) + sizeof(struct smb2_encryption_neg_context); } + +static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt) +{ + unsigned int len = le16_to_cpu(ctxt->DataLength); + + /* If invalid preauth context warn but use what we requested, SHA-512 */ + if (len < MIN_PREAUTH_CTXT_DATA_LEN) { + printk_once(KERN_WARNING "server sent bad preauth context\n"); + return; + } + if (le16_to_cpu(ctxt->HashAlgorithmCount) != 1) + printk_once(KERN_WARNING "illegal SMB3 hash algorithm count\n"); + if (ctxt->HashAlgorithms != SMB2_PREAUTH_INTEGRITY_SHA512) + printk_once(KERN_WARNING "unknown SMB3 hash algorithm\n"); +} + +static int decode_encrypt_ctx(struct TCP_Server_Info *server, + struct smb2_encryption_neg_context *ctxt) +{ + unsigned int len = le16_to_cpu(ctxt->DataLength); + + cifs_dbg(FYI, "decode SMB3.11 encryption neg context of len %d\n", len); + if (len < MIN_ENCRYPT_CTXT_DATA_LEN) { + printk_once(KERN_WARNING "server sent bad crypto ctxt len\n"); + return -EINVAL; + } + + if (le16_to_cpu(ctxt->CipherCount) != 1) { + printk_once(KERN_WARNING "illegal SMB3.11 cipher count\n"); + return -EINVAL; + } + cifs_dbg(FYI, "SMB311 cipher type:%d\n", le16_to_cpu(ctxt->Ciphers[0])); + if ((ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_CCM) && + (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_GCM)) { + printk_once(KERN_WARNING "invalid SMB3.11 cipher returned\n"); + return -EINVAL; + } + server->cipher_type = ctxt->Ciphers[0]; + return 0; +} + +static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, + struct TCP_Server_Info *server) +{ + struct smb2_neg_context *pctx; + unsigned int offset = le32_to_cpu(rsp->NegotiateContextOffset); + unsigned int ctxt_cnt = 
le16_to_cpu(rsp->NegotiateContextCount); + unsigned int len_of_smb = be32_to_cpu(rsp->hdr.smb2_buf_length); + unsigned int len_of_ctxts, i; + int rc = 0; + + cifs_dbg(FYI, "decoding %d negotiate contexts\n", ctxt_cnt); + if (len_of_smb <= offset) { + cifs_dbg(VFS, "Invalid response: negotiate context offset\n"); + return -EINVAL; + } + + len_of_ctxts = len_of_smb - offset; + + for (i = 0; i < ctxt_cnt; i++) { + int clen; + /* check that offset is not beyond end of SMB */ + if (len_of_ctxts == 0) + break; + + if (len_of_ctxts < sizeof(struct smb2_neg_context)) + break; + + pctx = (struct smb2_neg_context *)(offset + + server->vals->header_preamble_size + (char *)rsp); + clen = le16_to_cpu(pctx->DataLength); + if (clen > len_of_ctxts) + break; + + if (pctx->ContextType == SMB2_PREAUTH_INTEGRITY_CAPABILITIES) + decode_preauth_context( + (struct smb2_preauth_neg_context *)pctx); + else if (pctx->ContextType == SMB2_ENCRYPTION_CAPABILITIES) + rc = decode_encrypt_ctx(server, + (struct smb2_encryption_neg_context *)pctx); + else + cifs_dbg(VFS, "unknown negcontext of type %d ignored\n", + le16_to_cpu(pctx->ContextType)); + + if (rc) + break; + /* offsets must be 8 byte aligned */ + clen = (clen + 7) & ~0x7; + offset += clen + sizeof(struct smb2_neg_context); + len_of_ctxts -= clen; + } + return rc; +} + #else static void assemble_neg_contexts(struct smb2_negotiate_req *req, unsigned int *total_len) @@ -616,6 +713,15 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) else if (rc == 0) rc = -EIO; } + +#ifdef CONFIG_CIFS_SMB311 + if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) { + if (rsp->NegotiateContextCount) + rc = smb311_decode_neg_context(rsp, server); + else + cifs_dbg(VFS, "Missing expected negotiate contexts\n"); + } +#endif /* CONFIG_CIFS_SMB311 */ neg_exit: free_rsp_buf(resp_buftype, rsp); return rc; @@ -1026,7 +1132,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) if (rc) goto out; - if (offsetof(struct 
smb2_sess_setup_rsp, Buffer) - 4 != + if (offsetof(struct smb2_sess_setup_rsp, Buffer) - ses->server->vals->header_preamble_size != le16_to_cpu(rsp->SecurityBufferOffset)) { cifs_dbg(VFS, "Invalid security buffer offset %d\n", le16_to_cpu(rsp->SecurityBufferOffset)); @@ -1701,7 +1807,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, struct smb2_file_all_info *buf, - struct smb2_err_rsp **err_buf) + struct kvec *err_iov) { struct smb2_create_req *req; struct smb2_create_rsp *rsp; @@ -1841,9 +1947,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); - if (err_buf && rsp) - *err_buf = kmemdup(rsp, get_rfc1002_length(rsp) + 4, - GFP_KERNEL); + if (err_iov && rsp) { + *err_iov = rsp_iov; + resp_buftype = CIFS_NO_BUFFER; + rsp = NULL; + } goto creat_exit; } @@ -2098,13 +2206,13 @@ close_exit: } static int -validate_buf(unsigned int offset, unsigned int buffer_length, - struct smb2_hdr *hdr, unsigned int min_buf_size) - +validate_iov(struct TCP_Server_Info *server, + unsigned int offset, unsigned int buffer_length, + struct kvec *iov, unsigned int min_buf_size) { - unsigned int smb_len = be32_to_cpu(hdr->smb2_buf_length); - char *end_of_smb = smb_len + 4 /* RFC1001 length field */ + (char *)hdr; - char *begin_of_buf = 4 /* RFC1001 len field */ + offset + (char *)hdr; + unsigned int smb_len = iov->iov_len; + char *end_of_smb = smb_len + server->vals->header_preamble_size + (char *)iov->iov_base; + char *begin_of_buf = server->vals->header_preamble_size + offset + (char *)iov->iov_base; char *end_of_buf = begin_of_buf + buffer_length; @@ -2134,18 +2242,18 @@ validate_buf(unsigned int offset, unsigned int buffer_length, * Caller must free buffer. 
*/ static int -validate_and_copy_buf(unsigned int offset, unsigned int buffer_length, - struct smb2_hdr *hdr, unsigned int minbufsize, +validate_and_copy_iov(struct TCP_Server_Info *server, + unsigned int offset, unsigned int buffer_length, + struct kvec *iov, unsigned int minbufsize, char *data) - { - char *begin_of_buf = 4 /* RFC1001 len field */ + offset + (char *)hdr; + char *begin_of_buf = server->vals->header_preamble_size + offset + (char *)(iov->iov_base); int rc; if (!data) return -EINVAL; - rc = validate_buf(offset, buffer_length, hdr, minbufsize); + rc = validate_iov(server, offset, buffer_length, iov, minbufsize); if (rc) return rc; @@ -2223,9 +2331,10 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, } } - rc = validate_and_copy_buf(le16_to_cpu(rsp->OutputBufferOffset), + rc = validate_and_copy_iov(ses->server, + le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), - &rsp->hdr, min_len, *data); + &rsp_iov, min_len, *data); qinf_exit: free_rsp_buf(resp_buftype, rsp); @@ -3146,8 +3255,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, goto qdir_exit; } - rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset), - le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr, + rc = validate_iov(server, + le16_to_cpu(rsp->OutputBufferOffset), + le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, info_buf_size); if (rc) goto qdir_exit; @@ -3454,7 +3564,7 @@ static int build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, int outbuf_len, u64 persistent_fid, u64 volatile_fid) { - struct TCP_Server_Info *server = tcon->ses->server; + struct TCP_Server_Info *server; int rc; struct smb2_query_info_req *req; unsigned int total_len; @@ -3464,6 +3574,8 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) return -EIO; + server = tcon->ses->server; + rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req, &total_len); if (rc) 
@@ -3517,8 +3629,9 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, info = (struct smb2_fs_full_size_info *)(server->vals->header_preamble_size + le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr); - rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset), - le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr, + rc = validate_iov(server, + le16_to_cpu(rsp->OutputBufferOffset), + le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, sizeof(struct smb2_fs_full_size_info)); if (!rc) copy_fs_info_to_kstatfs(info, fsdata); @@ -3574,7 +3687,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, rsp_len = le32_to_cpu(rsp->OutputBufferLength); offset = le16_to_cpu(rsp->OutputBufferOffset); - rc = validate_buf(offset, rsp_len, &rsp->hdr, min_len); + rc = validate_iov(server, offset, rsp_len, &rsp_iov, min_len); if (rc) goto qfsattr_exit; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 253e2c7c952f..6093e5142b2b 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -263,11 +263,19 @@ struct smb2_negotiate_req { #define SMB2_NT_FIND 0x00100000 #define SMB2_LARGE_FILES 0x00200000 +struct smb2_neg_context { + __le16 ContextType; + __le16 DataLength; + __le32 Reserved; + /* Followed by array of data */ +} __packed; + #define SMB311_SALT_SIZE 32 /* Hash Algorithm Types */ #define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) #define SMB2_PREAUTH_HASH_SIZE 64 +#define MIN_PREAUTH_CTXT_DATA_LEN (SMB311_SALT_SIZE + 6) struct smb2_preauth_neg_context { __le16 ContextType; /* 1 */ __le16 DataLength; @@ -282,6 +290,8 @@ struct smb2_preauth_neg_context { #define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) #define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) +/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */ +#define MIN_ENCRYPT_CTXT_DATA_LEN 4 struct smb2_encryption_neg_context { __le16 ContextType; /* 2 */ __le16 DataLength; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index cbcce3f7e86f..8ba24a95db71 
100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -122,7 +122,7 @@ extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon); extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, struct smb2_file_all_info *buf, - struct smb2_err_rsp **err_buf); + struct kvec *err_iov); extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, u32 indatalen, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index bf49cb73b9e6..8806f3f76c1d 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -604,7 +604,7 @@ int smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error) { - unsigned int len = get_rfc1002_length(mid->resp_buf); + unsigned int len = mid->resp_buf_size; struct kvec iov[2]; struct smb_rqst rqst = { .rq_iov = iov, .rq_nvec = 2 }; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 279718dcb2ed..8f6f25918229 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -790,7 +790,7 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, buf = (char *)midQ->resp_buf; resp_iov->iov_base = buf; - resp_iov->iov_len = get_rfc1002_length(buf) + + resp_iov->iov_len = midQ->resp_buf_size + ses->server->vals->header_preamble_size; if (midQ->large_buf) *resp_buf_type = CIFS_LARGE_BUFFER; @@ -73,16 +73,15 @@ fs_initcall(init_dax_wait_table); #define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) #define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) -static unsigned long dax_radix_sector(void *entry) +static unsigned long dax_radix_pfn(void *entry) { return (unsigned long)entry >> RADIX_DAX_SHIFT; } -static void *dax_radix_locked_entry(sector_t sector, unsigned long flags) +static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags) { return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | - 
((unsigned long)sector << RADIX_DAX_SHIFT) | - RADIX_DAX_ENTRY_LOCK); + (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK); } static unsigned int dax_radix_order(void *entry) @@ -159,11 +158,9 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo } /* - * We do not necessarily hold the mapping->tree_lock when we call this - * function so it is possible that 'entry' is no longer a valid item in the - * radix tree. This is okay because all we really need to do is to find the - * correct waitqueue where tasks might be waiting for that old 'entry' and - * wake them. + * @entry may no longer be the entry at the index in the mapping. + * The important information it's conveying is whether the entry at + * this index used to be a PMD entry. */ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, pgoff_t index, void *entry, bool wake_all) @@ -175,7 +172,7 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, /* * Checking for locked entry and prepare_to_wait_exclusive() happens - * under mapping->tree_lock, ditto for entry handling in our callers. + * under the i_pages lock, ditto for entry handling in our callers. * So at this point all tasks that could have seen our entry locked * must be in the waitqueue and the following check will see them. */ @@ -184,41 +181,39 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, } /* - * Check whether the given slot is locked. The function must be called with - * mapping->tree_lock held + * Check whether the given slot is locked. Must be called with the i_pages + * lock held. */ static inline int slot_locked(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); return entry & RADIX_DAX_ENTRY_LOCK; } /* - * Mark the given slot is locked. 
The function must be called with - * mapping->tree_lock held + * Mark the given slot as locked. Must be called with the i_pages lock held. */ static inline void *lock_slot(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); + radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); return (void *)entry; } /* - * Mark the given slot is unlocked. The function must be called with - * mapping->tree_lock held + * Mark the given slot as unlocked. Must be called with the i_pages lock held. */ static inline void *unlock_slot(struct address_space *mapping, void **slot) { unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); + radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); return (void *)entry; } @@ -229,7 +224,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) * put_locked_mapping_entry() when he locked the entry and now wants to * unlock it. * - * The function must be called with mapping->tree_lock held. + * Must be called with the i_pages lock held. 
*/ static void *get_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void ***slotp) @@ -242,7 +237,7 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, ewait.wait.func = wake_exceptional_entry_func; for (;;) { - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || @@ -255,10 +250,10 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); schedule(); finish_wait(wq, &ewait.wait); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } } @@ -267,15 +262,15 @@ static void dax_unlock_mapping_entry(struct address_space *mapping, { void *entry, **slot; - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + xa_lock_irq(&mapping->i_pages); + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || !slot_locked(mapping, slot))) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return; } unlock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); dax_wake_mapping_entry_waiter(mapping, index, entry, false); } @@ -299,6 +294,63 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, dax_wake_mapping_entry_waiter(mapping, index, entry, false); } +static unsigned long dax_entry_size(void *entry) +{ + if (dax_is_zero_entry(entry)) + return 0; + else if (dax_is_empty_entry(entry)) + return 0; + else if (dax_is_pmd_entry(entry)) + return PMD_SIZE; + else + return PAGE_SIZE; +} + +static unsigned long dax_radix_end_pfn(void *entry) +{ + 
return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; +} + +/* + * Iterate through all mapped pfns represented by an entry, i.e. skip + * 'empty' and 'zero' entries. + */ +#define for_each_mapped_pfn(entry, pfn) \ + for (pfn = dax_radix_pfn(entry); \ + pfn < dax_radix_end_pfn(entry); pfn++) + +static void dax_associate_entry(void *entry, struct address_space *mapping) +{ + unsigned long pfn; + + if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) + return; + + for_each_mapped_pfn(entry, pfn) { + struct page *page = pfn_to_page(pfn); + + WARN_ON_ONCE(page->mapping); + page->mapping = mapping; + } +} + +static void dax_disassociate_entry(void *entry, struct address_space *mapping, + bool trunc) +{ + unsigned long pfn; + + if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) + return; + + for_each_mapped_pfn(entry, pfn) { + struct page *page = pfn_to_page(pfn); + + WARN_ON_ONCE(trunc && page_ref_count(page) > 1); + WARN_ON_ONCE(page->mapping && page->mapping != mapping); + page->mapping = NULL; + } +} + /* * Find radix tree entry at given index. If it points to an exceptional entry, * return it with the radix tree entry locked. If the radix tree doesn't @@ -332,7 +384,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry, **slot; restart: - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { @@ -364,12 +416,12 @@ restart: if (pmd_downgrade) { /* * Make sure 'entry' remains valid while we drop - * mapping->tree_lock. + * the i_pages lock. 
*/ entry = lock_slot(mapping, slot); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* * Besides huge zero pages the only other thing that gets * downgraded are empty entries which don't need to be @@ -386,26 +438,27 @@ restart: put_locked_mapping_entry(mapping, index); return ERR_PTR(err); } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (!entry) { /* - * We needed to drop the page_tree lock while calling + * We needed to drop the i_pages lock while calling * radix_tree_preload() and we didn't have an entry to * lock. See if another thread inserted an entry at * our index during this time. */ - entry = __radix_tree_lookup(&mapping->page_tree, index, + entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (entry) { radix_tree_preload_end(); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); goto restart; } } if (pmd_downgrade) { - radix_tree_delete(&mapping->page_tree, index); + dax_disassociate_entry(entry, mapping, false); + radix_tree_delete(&mapping->i_pages, index); mapping->nrexceptional--; dax_wake_mapping_entry_waiter(mapping, index, entry, true); @@ -413,11 +466,11 @@ restart: entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); - err = __radix_tree_insert(&mapping->page_tree, index, + err = __radix_tree_insert(&mapping->i_pages, index, dax_radix_order(entry), entry); radix_tree_preload_end(); if (err) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* * Our insertion of a DAX entry failed, most likely * because we were inserting a PMD entry and it @@ -430,12 +483,12 @@ restart: } /* Good, we have inserted empty locked entry into the tree. 
*/ mapping->nrexceptional++; - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return entry; } entry = lock_slot(mapping, slot); out_unlock: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return entry; } @@ -444,22 +497,23 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, { int ret = 0; void *entry; - struct radix_tree_root *page_tree = &mapping->page_tree; + struct radix_tree_root *pages = &mapping->i_pages; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); entry = get_unlocked_mapping_entry(mapping, index, NULL); if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) goto out; if (!trunc && - (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) + (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || + radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))) goto out; - radix_tree_delete(page_tree, index); + dax_disassociate_entry(entry, mapping, trunc); + radix_tree_delete(pages, index); mapping->nrexceptional--; ret = 1; out: put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return ret; } /* @@ -526,12 +580,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, */ static void *dax_insert_mapping_entry(struct address_space *mapping, struct vm_fault *vmf, - void *entry, sector_t sector, + void *entry, pfn_t pfn_t, unsigned long flags, bool dirty) { - struct radix_tree_root *page_tree = &mapping->page_tree; - void *new_entry; + struct radix_tree_root *pages = &mapping->i_pages; + unsigned long pfn = pfn_t_to_pfn(pfn_t); pgoff_t index = vmf->pgoff; + void *new_entry; if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -545,8 +600,12 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, unmap_mapping_pages(mapping, vmf->pgoff, 1, false); } - 
spin_lock_irq(&mapping->tree_lock); - new_entry = dax_radix_locked_entry(sector, flags); + xa_lock_irq(pages); + new_entry = dax_radix_locked_entry(pfn, flags); + if (dax_entry_size(entry) != dax_entry_size(new_entry)) { + dax_disassociate_entry(entry, mapping, false); + dax_associate_entry(new_entry, mapping); + } if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* @@ -561,17 +620,17 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, void **slot; void *ret; - ret = __radix_tree_lookup(page_tree, index, &node, &slot); + ret = __radix_tree_lookup(pages, index, &node, &slot); WARN_ON_ONCE(ret != entry); - __radix_tree_replace(page_tree, node, slot, + __radix_tree_replace(pages, node, slot, new_entry, NULL); entry = new_entry; } if (dirty) - radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); + radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return entry; } @@ -657,17 +716,14 @@ unlock_pte: i_mmap_unlock_read(mapping); } -static int dax_writeback_one(struct block_device *bdev, - struct dax_device *dax_dev, struct address_space *mapping, - pgoff_t index, void *entry) +static int dax_writeback_one(struct dax_device *dax_dev, + struct address_space *mapping, pgoff_t index, void *entry) { - struct radix_tree_root *page_tree = &mapping->page_tree; - void *entry2, **slot, *kaddr; - long ret = 0, id; - sector_t sector; - pgoff_t pgoff; + struct radix_tree_root *pages = &mapping->i_pages; + void *entry2, **slot; + unsigned long pfn; + long ret = 0; size_t size; - pfn_t pfn; /* * A page got tagged dirty in DAX mapping? Something is seriously @@ -676,17 +732,17 @@ static int dax_writeback_one(struct block_device *bdev, if (WARN_ON(!radix_tree_exceptional_entry(entry))) return -EIO; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(pages); entry2 = get_unlocked_mapping_entry(mapping, index, &slot); /* Entry got punched out / reallocated? 
*/ if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) goto put_unlocked; /* * Entry got reallocated elsewhere? No need to writeback. We have to - * compare sectors as we must not bail out due to difference in lockbit + * compare pfns as we must not bail out due to difference in lockbit * or entry type. */ - if (dax_radix_sector(entry2) != dax_radix_sector(entry)) + if (dax_radix_pfn(entry2) != dax_radix_pfn(entry)) goto put_unlocked; if (WARN_ON_ONCE(dax_is_empty_entry(entry) || dax_is_zero_entry(entry))) { @@ -695,7 +751,7 @@ static int dax_writeback_one(struct block_device *bdev, } /* Another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)) goto put_unlocked; /* Lock the entry to serialize with page faults */ entry = lock_slot(mapping, slot); @@ -703,60 +759,40 @@ static int dax_writeback_one(struct block_device *bdev, * We can clear the tag now but we have to be careful so that concurrent * dax_writeback_one() calls for the same index cannot finish before we * actually flush the caches. This is achieved as the calls will look - * at the entry only under tree_lock and once they do that they will - * see the entry locked and wait for it to unlock. + * at the entry only under the i_pages lock and once they do that + * they will see the entry locked and wait for it to unlock. */ - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); - spin_unlock_irq(&mapping->tree_lock); + radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE); + xa_unlock_irq(pages); /* * Even if dax_writeback_mapping_range() was given a wbc->range_start * in the middle of a PMD, the 'index' we are given will be aligned to - * the start index of the PMD, as will the sector we pull from - * 'entry'. This allows us to flush for PMD_SIZE and not have to - * worry about partial PMD writebacks. 
+ * the start index of the PMD, as will the pfn we pull from 'entry'. + * This allows us to flush for PMD_SIZE and not have to worry about + * partial PMD writebacks. */ - sector = dax_radix_sector(entry); + pfn = dax_radix_pfn(entry); size = PAGE_SIZE << dax_radix_order(entry); - id = dax_read_lock(); - ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); - if (ret) - goto dax_unlock; - - /* - * dax_direct_access() may sleep, so cannot hold tree_lock over - * its invocation. - */ - ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn); - if (ret < 0) - goto dax_unlock; - - if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) { - ret = -EIO; - goto dax_unlock; - } - - dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); - dax_flush(dax_dev, kaddr, size); + dax_mapping_entry_mkclean(mapping, index, pfn); + dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as * the pfn mappings are writeprotected and fault waits for mapping * entry lock. 
*/ - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(pages); + radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY); + xa_unlock_irq(pages); trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); - dax_unlock: - dax_read_unlock(id); put_locked_mapping_entry(mapping, index); return ret; put_unlocked: put_unlocked_mapping_entry(mapping, index, entry2); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(pages); return ret; } @@ -808,8 +844,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, break; } - ret = dax_writeback_one(bdev, dax_dev, mapping, - indices[i], pvec.pages[i]); + ret = dax_writeback_one(dax_dev, mapping, indices[i], + pvec.pages[i]); if (ret < 0) { mapping_set_error(mapping, ret); goto out; @@ -877,6 +913,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry, int ret = VM_FAULT_NOPAGE; struct page *zero_page; void *entry2; + pfn_t pfn; zero_page = ZERO_PAGE(0); if (unlikely(!zero_page)) { @@ -884,14 +921,15 @@ static int dax_load_hole(struct address_space *mapping, void *entry, goto out; } - entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, + pfn = page_to_pfn_t(zero_page); + entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(entry2)) { ret = VM_FAULT_SIGBUS; goto out; } - vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page)); + vm_insert_mixed(vmf->vma, vaddr, pfn); out: trace_dax_load_hole(inode, vmf, ret); return ret; @@ -1200,8 +1238,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto error_finish_iomap; - entry = dax_insert_mapping_entry(mapping, vmf, entry, - dax_iomap_sector(&iomap, pos), + entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, 0, write && !sync); if (IS_ERR(entry)) { error = PTR_ERR(entry); @@ -1280,13 +1317,15 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, 
struct iomap *iomap, void *ret = NULL; spinlock_t *ptl; pmd_t pmd_entry; + pfn_t pfn; zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); if (unlikely(!zero_page)) goto fallback; - ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, + pfn = page_to_pfn_t(zero_page); + ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(ret)) goto fallback; @@ -1409,8 +1448,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto finish_iomap; - entry = dax_insert_mapping_entry(mapping, vmf, entry, - dax_iomap_sector(&iomap, pos), + entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_PMD, write && !sync); if (IS_ERR(entry)) goto finish_iomap; @@ -1524,21 +1562,21 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, pgoff_t index = vmf->pgoff; int vmf_ret, error; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); /* Did we race with someone splitting entry or so? 
*/ if (!entry || (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } - radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY); entry = lock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); switch (pe_size) { case PE_SIZE_PTE: error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); diff --git a/fs/dcache.c b/fs/dcache.c index 593079176123..86d2de63461e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -257,11 +257,25 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } +static void __d_free_external_name(struct rcu_head *head) +{ + struct external_name *name = container_of(head, struct external_name, + u.head); + + mod_node_page_state(page_pgdat(virt_to_page(name)), + NR_INDIRECTLY_RECLAIMABLE_BYTES, + -ksize(name)); + + kfree(name); +} + static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - kfree(external_name(dentry)); - kmem_cache_free(dentry_cache, dentry); + + __d_free_external_name(&external_name(dentry)->u.head); + + kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) @@ -291,7 +305,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name) struct external_name *p; p = container_of(name->name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) - kfree_rcu(p, u.head); + call_rcu(&p->u.head, __d_free_external_name); } } EXPORT_SYMBOL(release_dentry_name_snapshot); @@ -1038,6 +1052,8 @@ static void shrink_dentry_list(struct list_head *list) while 
(!list_empty(list)) { struct dentry *dentry, *parent; + cond_resched(); + dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); @@ -1191,7 +1207,6 @@ void shrink_dcache_sb(struct super_block *sb) this_cpu_sub(nr_dentry_unused, freed); shrink_dentry_list(&dispose); - cond_resched(); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -1473,7 +1488,6 @@ void shrink_dcache_parent(struct dentry *parent) break; shrink_dentry_list(&data.dispose); - cond_resched(); } } EXPORT_SYMBOL(shrink_dcache_parent); @@ -1600,7 +1614,6 @@ void d_invalidate(struct dentry *dentry) detach_mounts(data.mountpoint); dput(data.mountpoint); } - cond_resched(); } } EXPORT_SYMBOL(d_invalidate); @@ -1617,6 +1630,7 @@ EXPORT_SYMBOL(d_invalidate); struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { + struct external_name *ext = NULL; struct dentry *dentry; char *dname; int err; @@ -1637,14 +1651,14 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - struct external_name *p = kmalloc(size + name->len, - GFP_KERNEL_ACCOUNT); - if (!p) { + + ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT); + if (!ext) { kmem_cache_free(dentry_cache, dentry); return NULL; } - atomic_set(&p->u.count, 1); - dname = p->name; + atomic_set(&ext->u.count, 1); + dname = ext->name; } else { dname = dentry->d_iname; } @@ -1683,6 +1697,12 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) } } + if (unlikely(ext)) { + pg_data_t *pgdat = page_pgdat(virt_to_page(ext)); + mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES, + ksize(ext)); + } + this_cpu_inc(nr_dentry); return dentry; @@ -2770,7 +2790,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target) dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && 
likely(atomic_dec_and_test(&old_name->u.count))) - kfree_rcu(old_name, u.head); + call_rcu(&old_name->u.head, __d_free_external_name); } /* diff --git a/fs/direct-io.c b/fs/direct-io.c index 1357ef563893..874607bb6e02 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -315,8 +315,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) dio_warn_stale_pagecache(dio->iocb->ki_filp); } - if (!(dio->flags & DIO_SKIP_DIO_COUNT)) - inode_dio_end(dio->inode); + inode_dio_end(dio->inode); if (flags & DIO_COMPLETE_ASYNC) { /* @@ -1178,9 +1177,9 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; - size_t count = iov_iter_count(iter); + const size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; - loff_t end = offset + count; + const loff_t end = offset + count; struct dio *dio; struct dio_submit sdio = { 0, }; struct buffer_head map_bh = { 0, }; @@ -1201,7 +1200,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } /* watch out for a 0 len io from a tricksy fs */ - if (iov_iter_rw(iter) == READ && !iov_iter_count(iter)) + if (iov_iter_rw(iter) == READ && !count) return 0; dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); @@ -1252,8 +1251,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, */ if (is_sync_kiocb(iocb)) dio->is_async = false; - else if (!(dio->flags & DIO_ASYNC_EXTEND) && - iov_iter_rw(iter) == WRITE && end > i_size_read(inode)) + else if (iov_iter_rw(iter) == WRITE && end > i_size_read(inode)) dio->is_async = false; else dio->is_async = true; @@ -1297,8 +1295,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, /* * Will be decremented at I/O completion time. 
*/ - if (!(dio->flags & DIO_SKIP_DIO_COUNT)) - inode_dio_begin(inode); + inode_dio_begin(inode); retval = 0; sdio.blkbits = blkbits; @@ -1318,8 +1315,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->should_dirty = (iter->type == ITER_IOVEC); sdio.iter = iter; - sdio.final_block_in_request = - (offset + iov_iter_count(iter)) >> blkbits; + sdio.final_block_in_request = end >> blkbits; /* * In case of non-aligned buffers, we may need 2 more diff --git a/fs/exec.c b/fs/exec.c index 7eb8d21bcab9..183059c427b9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -257,7 +257,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * to work from. */ limit = _STK_LIM / 4 * 3; - limit = min(limit, rlimit(RLIMIT_STACK) / 4); + limit = min(limit, bprm->rlim_stack.rlim_cur / 4); if (size > limit) goto fail; } @@ -411,6 +411,11 @@ static int bprm_mm_init(struct linux_binprm *bprm) if (!mm) goto err; + /* Save current stack limit for all calculations made during exec. */ + task_lock(current->group_leader); + bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; + task_unlock(current->group_leader); + err = __bprm_mm_init(bprm); if (err) goto err; @@ -697,7 +702,7 @@ int setup_arg_pages(struct linux_binprm *bprm, #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ - stack_base = rlimit_max(RLIMIT_STACK); + stack_base = bprm->rlim_stack.rlim_max; if (stack_base > STACK_SIZE_MAX) stack_base = STACK_SIZE_MAX; @@ -770,7 +775,7 @@ int setup_arg_pages(struct linux_binprm *bprm, * Align this down to a page boundary as expand_stack * will align it up. 
*/ - rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK; + rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; #ifdef CONFIG_STACK_GROWSUP if (stack_size + stack_expand > rlim_stack) stack_base = vma->vm_start + rlim_stack; @@ -895,13 +900,13 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0) return -EINVAL; - ret = security_kernel_read_file(file, id); + ret = deny_write_access(file); if (ret) return ret; - ret = deny_write_access(file); + ret = security_kernel_read_file(file, id); if (ret) - return ret; + goto out; i_size = i_size_read(file_inode(file)); if (max_size > 0 && i_size > max_size) { @@ -1341,11 +1346,11 @@ void setup_new_exec(struct linux_binprm * bprm) * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ - if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) - current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; + if (bprm->rlim_stack.rlim_cur > _STK_LIM) + bprm->rlim_stack.rlim_cur = _STK_LIM; } - arch_pick_mmap_layout(current->mm); + arch_pick_mmap_layout(current->mm, &bprm->rlim_stack); current->sas_ss_sp = current->sas_ss_size = 0; @@ -1378,6 +1383,16 @@ void setup_new_exec(struct linux_binprm * bprm) } EXPORT_SYMBOL(setup_new_exec); +/* Runs immediately before start_thread() takes over. */ +void finalize_exec(struct linux_binprm *bprm) +{ + /* Store any stack rlimit changes before starting thread. */ + task_lock(current->group_leader); + current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; + task_unlock(current->group_leader); +} +EXPORT_SYMBOL(finalize_exec); + /* * Prepare credentials and lock ->cred_guard_mutex. * install_exec_creds() commits the new creds and drops the lock. 
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 329a5d103846..645158dc33f1 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -435,6 +435,15 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, if (IS_ERR_OR_NULL(result)) return ERR_PTR(-ESTALE); + /* + * If no acceptance criteria was specified by caller, a disconnected + * dentry is also accepatable. Callers may use this mode to query if + * file handle is stale or to get a reference to an inode without + * risking the high overhead caused by directory reconnect. + */ + if (!acceptable) + return result; + if (d_is_dir(result)) { /* * This request is for a directory. diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 032295e1d386..cc40802ddfa8 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -814,6 +814,7 @@ extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; /* inode.c */ +extern void ext2_set_file_ops(struct inode *inode); extern const struct address_space_operations ext2_aops; extern const struct address_space_operations ext2_nobh_aops; extern const struct iomap_ops ext2_iomap_ops; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 9b2ac55ac34f..1e01fabef130 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -940,9 +940,6 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t offset = iocb->ki_pos; ssize_t ret; - if (WARN_ON_ONCE(IS_DAX(inode))) - return -EIO; - ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) ext2_write_failed(mapping, offset + count); @@ -952,17 +949,16 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) { -#ifdef CONFIG_FS_DAX - if (dax_mapping(mapping)) { - return dax_writeback_mapping_range(mapping, - mapping->host->i_sb->s_bdev, - wbc); - } -#endif - return mpage_writepages(mapping, wbc, 
ext2_get_block); } +static int +ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + return dax_writeback_mapping_range(mapping, + mapping->host->i_sb->s_bdev, wbc); +} + const struct address_space_operations ext2_aops = { .readpage = ext2_readpage, .readpages = ext2_readpages, @@ -990,6 +986,13 @@ const struct address_space_operations ext2_nobh_aops = { .error_remove_page = generic_error_remove_page, }; +static const struct address_space_operations ext2_dax_aops = { + .writepages = ext2_dax_writepages, + .direct_IO = noop_direct_IO, + .set_page_dirty = noop_set_page_dirty, + .invalidatepage = noop_invalidatepage, +}; + /* * Probably it should be a library function... search for first non-zero word * or memcmp with zero_page, whatever is better for particular architecture. @@ -1388,6 +1391,18 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_DAX; } +void ext2_set_file_ops(struct inode *inode) +{ + inode->i_op = &ext2_file_inode_operations; + inode->i_fop = &ext2_file_operations; + if (IS_DAX(inode)) + inode->i_mapping->a_ops = &ext2_dax_aops; + else if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; +} + struct inode *ext2_iget (struct super_block *sb, unsigned long ino) { struct ext2_inode_info *ei; @@ -1480,14 +1495,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) ei->i_data[n] = raw_inode->i_block[n]; if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, NOBH)) { - inode->i_mapping->a_ops = &ext2_nobh_aops; - inode->i_fop = &ext2_file_operations; - } else { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_file_operations; - } + ext2_set_file_ops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext2_dir_inode_operations; inode->i_fop = &ext2_dir_operations; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index e078075dc66f..55f7caadb093 
100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -107,14 +107,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode if (IS_ERR(inode)) return PTR_ERR(inode); - inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, NOBH)) { - inode->i_mapping->a_ops = &ext2_nobh_aops; - inode->i_fop = &ext2_file_operations; - } else { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_file_operations; - } + ext2_set_file_ops(inode); mark_inode_dirty(inode); return ext2_add_nondir(dentry, inode); } @@ -125,14 +118,7 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(inode)) return PTR_ERR(inode); - inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, NOBH)) { - inode->i_mapping->a_ops = &ext2_nobh_aops; - inode->i_fop = &ext2_file_operations; - } else { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_file_operations; - } + ext2_set_file_ops(inode); mark_inode_dirty(inode); d_tmpfile(dentry, inode); unlock_new_inode(inode); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 7666c065b96f..de1694512f1f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -827,7 +827,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) unsigned long logic_sb_block; unsigned long offset = 0; unsigned long def_mount_opts; - long ret = -EINVAL; + long ret = -ENOMEM; int blocksize = BLOCK_SIZE; int db_count; int i, j; @@ -835,7 +835,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) int err; struct ext2_mount_options opts; - err = -ENOMEM; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) goto failed; @@ -851,6 +850,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_daxdev = dax_dev; spin_lock_init(&sbi->s_lock); + ret = -EINVAL; /* * See what the current blocksize for the device is, and diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 18aa2ef963ad..1e50c5efae67 100644 
--- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2716,12 +2716,6 @@ static int ext4_writepages(struct address_space *mapping, percpu_down_read(&sbi->s_journal_flag_rwsem); trace_ext4_writepages(inode, wbc); - if (dax_mapping(mapping)) { - ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, - wbc); - goto out_writepages; - } - /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() @@ -2942,6 +2936,27 @@ out_writepages: return ret; } +static int ext4_dax_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + int ret; + long nr_to_write = wbc->nr_to_write; + struct inode *inode = mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + + percpu_down_read(&sbi->s_journal_flag_rwsem); + trace_ext4_writepages(inode, wbc); + + ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc); + trace_ext4_writepages_result(inode, wbc, ret, + nr_to_write - wbc->nr_to_write); + percpu_up_read(&sbi->s_journal_flag_rwsem); + return ret; +} + static int ext4_nonda_switch(struct super_block *sb) { s64 free_clusters, dirty_clusters; @@ -3845,10 +3860,6 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (ext4_has_inline_data(inode)) return 0; - /* DAX uses iomap path now */ - if (WARN_ON_ONCE(IS_DAX(inode))) - return 0; - trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == READ) ret = ext4_direct_IO_read(iocb, iter); @@ -3934,6 +3945,13 @@ static const struct address_space_operations ext4_da_aops = { .error_remove_page = generic_error_remove_page, }; +static const struct address_space_operations ext4_dax_aops = { + .writepages = ext4_dax_writepages, + .direct_IO = noop_direct_IO, + .set_page_dirty = noop_set_page_dirty, + .invalidatepage = noop_invalidatepage, +}; + void ext4_set_aops(struct inode 
*inode) { switch (ext4_inode_journal_mode(inode)) { @@ -3946,7 +3964,9 @@ void ext4_set_aops(struct inode *inode) default: BUG(); } - if (test_opt(inode->i_sb, DELALLOC)) + if (IS_DAX(inode)) + inode->i_mapping->a_ops = &ext4_dax_aops; + else if (test_opt(inode->i_sb, DELALLOC)) inode->i_mapping->a_ops = &ext4_da_aops; else inode->i_mapping->a_ops = &ext4_aops; @@ -5024,12 +5044,12 @@ static int other_inode_match(struct inode * inode, unsigned long ino, if ((inode->i_ino != ino) || (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | - I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + I_DIRTY_INODE)) || ((inode->i_state & I_DIRTY_TIME) == 0)) return 0; spin_lock(&inode->i_lock); if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | - I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) && + I_DIRTY_INODE)) == 0) && (inode->i_state & I_DIRTY_TIME)) { struct ext4_inode_info *ei = EXT4_I(inode); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 512dca8abc7d..bf779461df13 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -68,6 +68,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, + .is_meta = is_meta, }; if (unlikely(!is_meta)) @@ -162,6 +163,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .op_flags = sync ? 
(REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, .in_list = false, + .is_meta = (type != META_POR), }; struct blk_plug plug; @@ -569,13 +571,8 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) struct node_info ni; int err = acquire_orphan_inode(sbi); - if (err) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x), run fsck to fix.", - __func__, ino); - return err; - } + if (err) + goto err_out; __add_ino_entry(sbi, ino, 0, ORPHAN_INO); @@ -589,6 +586,11 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return PTR_ERR(inode); } + err = dquot_initialize(inode); + if (err) + goto err_out; + + dquot_initialize(inode); clear_nlink(inode); /* truncate all the data during iput */ @@ -598,14 +600,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* ENOMEM was fully retried in f2fs_evict_inode. */ if (ni.blk_addr != NULL_ADDR) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x) by kernel, retry mount.", - __func__, ino); - return -EIO; + err = -EIO; + goto err_out; } __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; + +err_out: + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; } int recover_orphan_inodes(struct f2fs_sb_info *sbi) @@ -1136,6 +1142,8 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + else + __clear_ckpt_flags(ckpt, CP_TRIMMED_FLAG); if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); @@ -1162,6 +1170,39 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_unlock_irqrestore(&sbi->cp_lock, flags); } +static void commit_checkpoint(struct f2fs_sb_info *sbi, + void *src, block_t blk_addr) +{ + struct writeback_control wbc = { + 
.for_reclaim = 0, + }; + + /* + * pagevec_lookup_tag and lock_page again will take + * some extra time. Therefore, update_meta_pages and + * sync_meta_pages are combined in this function. + */ + struct page *page = grab_meta_page(sbi, blk_addr); + int err; + + memcpy(page_address(page), src, PAGE_SIZE); + set_page_dirty(page); + + f2fs_wait_on_page_writeback(page, META, true); + f2fs_bug_on(sbi, PageWriteback(page)); + if (unlikely(!clear_page_dirty_for_io(page))) + f2fs_bug_on(sbi, 1); + + /* writeout cp pack 2 page */ + err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO); + f2fs_bug_on(sbi, err); + + f2fs_put_page(page, 0); + + /* submit checkpoint (with barrier if NOBARRIER is not set) */ + f2fs_submit_merged_write(sbi, META_FLUSH); +} + static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1264,16 +1305,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) } } - /* need to wait for end_io results */ - wait_on_all_pages_writeback(sbi); - if (unlikely(f2fs_cp_error(sbi))) - return -EIO; - - /* flush all device cache */ - err = f2fs_flush_device_cache(sbi); - if (err) - return err; - /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1301,26 +1332,26 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk += NR_CURSEG_NODE_TYPE; } - /* writeout checkpoint block */ - update_meta_page(sbi, ckpt, start_blk); + /* update user_block_counts */ + sbi->last_valid_block_count = sbi->total_valid_block_count; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); + + /* Here, we have one bio having CP pack except cp pack 2 page */ + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); - /* wait for previous submitted node/meta pages writeback */ + /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) return -EIO; - 
filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX); - filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX); - - /* update user_block_counts */ - sbi->last_valid_block_count = sbi->total_valid_block_count; - percpu_counter_set(&sbi->alloc_valid_block_count, 0); - - /* Here, we only have one bio having CP pack */ - sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO); + /* flush all device cache */ + err = f2fs_flush_device_cache(sbi); + if (err) + return err; - /* wait for previous submitted meta pages writeback */ + /* barrier and flush checkpoint cp pack 2 page if it can */ + commit_checkpoint(sbi, ckpt, start_blk); wait_on_all_pages_writeback(sbi); release_ino_entry(sbi, false); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7578ed1a85e0..02237d4d91f5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -175,15 +175,22 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, */ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, struct writeback_control *wbc, - int npages, bool is_read) + int npages, bool is_read, + enum page_type type, enum temp_type temp) { struct bio *bio; bio = f2fs_bio_alloc(sbi, npages, true); f2fs_target_device(sbi, blk_addr, bio); - bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; - bio->bi_private = is_read ? 
NULL : sbi; + if (is_read) { + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = NULL; + } else { + bio->bi_end_io = f2fs_write_end_io; + bio->bi_private = sbi; + bio->bi_write_hint = io_type_to_rw_hint(sbi, type, temp); + } if (wbc) wbc_init_bio(wbc, bio); @@ -196,13 +203,12 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (!is_read_io(bio_op(bio))) { unsigned int start; - if (f2fs_sb_mounted_blkzoned(sbi->sb) && - current->plug && (type == DATA || type == NODE)) - blk_finish_plug(current->plug); - if (type != DATA && type != NODE) goto submit_io; + if (f2fs_sb_has_blkzoned(sbi->sb) && current->plug) + blk_finish_plug(current->plug); + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; start %= F2FS_IO_SIZE(sbi); @@ -377,12 +383,13 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; + verify_block_addr(fio, fio->new_blkaddr); trace_f2fs_submit_page_bio(page, fio); f2fs_trace_ios(fio, 0); /* Allocate a new bio */ bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc, - 1, is_read_io(fio->op)); + 1, is_read_io(fio->op), fio->type, fio->temp); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); @@ -422,8 +429,8 @@ next: } if (fio->old_blkaddr != NEW_ADDR) - verify_block_addr(sbi, fio->old_blkaddr); - verify_block_addr(sbi, fio->new_blkaddr); + verify_block_addr(fio, fio->old_blkaddr); + verify_block_addr(fio, fio->new_blkaddr); bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; @@ -445,7 +452,8 @@ alloc_new: goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc, - BIO_MAX_PAGES, false); + BIO_MAX_PAGES, false, + fio->type, fio->temp); io->fio = *fio; } @@ -832,13 +840,6 @@ alloc: return 0; } -static inline bool __force_buffered_io(struct inode *inode, int rw) -{ - return (f2fs_encrypted_file(inode) || - (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || - F2FS_I_SB(inode)->s_ndevs); -} - int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); @@ -870,7 +871,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) if (direct_io) { map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); - flag = __force_buffered_io(inode, WRITE) ? + flag = f2fs_force_buffered_io(inode, WRITE) ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; goto map_blocks; @@ -1114,6 +1115,31 @@ out: return err; } +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) +{ + struct f2fs_map_blocks map; + block_t last_lblk; + int err; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = F2FS_BYTES_TO_BLK(pos); + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + last_lblk = F2FS_BLK_ALIGN(pos + len); + + while (map.m_lblk < last_lblk) { + map.m_len = last_lblk - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + if (err || map.m_len == 0) + return false; + map.m_lblk += map.m_len; + } + return true; +} + static int __get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create, int flag, pgoff_t *next_pgofs, int seg_type) @@ -2287,25 +2313,41 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); size_t count = iov_iter_count(iter); loff_t offset = 
iocb->ki_pos; int rw = iov_iter_rw(iter); int err; + enum rw_hint hint = iocb->ki_hint; + int whint_mode = F2FS_OPTION(sbi).whint_mode; err = check_direct_IO(inode, iter, offset); if (err) return err; - if (__force_buffered_io(inode, rw)) + if (f2fs_force_buffered_io(inode, rw)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); - down_read(&F2FS_I(inode)->dio_rwsem[rw]); + if (rw == WRITE && whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = WRITE_LIFE_NOT_SET; + + if (!down_read_trylock(&F2FS_I(inode)->dio_rwsem[rw])) { + if (iocb->ki_flags & IOCB_NOWAIT) { + iocb->ki_hint = hint; + err = -EAGAIN; + goto out; + } + down_read(&F2FS_I(inode)->dio_rwsem[rw]); + } + err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio); up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { + if (whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = hint; if (err > 0) { f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, err); @@ -2315,6 +2357,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } } +out: trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); return err; @@ -2381,12 +2424,12 @@ void f2fs_set_page_dirty_nobuffers(struct page *page) SetPageDirty(page); spin_unlock(&mapping->private_lock); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); WARN_ON_ONCE(!PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f00b5ed8c011..8c9c2f31b253 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,14 +94,12 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; - dentry_blk = 
(struct f2fs_dentry_block *)kmap(dentry_page); + dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; - else - kunmap(dentry_page); return de; } @@ -287,7 +285,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, de = f2fs_find_entry(dir, qstr, page); if (de) { res = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, *page); f2fs_put_page(*page, 0); } @@ -302,7 +299,6 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_wait_on_page_writeback(page, type, true); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode->i_mode); - f2fs_dentry_kunmap(dir, page); set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); @@ -350,13 +346,11 @@ static int make_empty_dir(struct inode *inode, if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - dentry_blk = kmap_atomic(dentry_page); + dentry_blk = page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); - kunmap_atomic(dentry_blk); - set_page_dirty(dentry_page); f2fs_put_page(dentry_page, 1); return 0; @@ -367,6 +361,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, struct page *dpage) { struct page *page; + int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir)); int err; if (is_inode_flag_set(inode, FI_NEW_INODE)) { @@ -393,7 +388,8 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, if (err) goto put_error; - if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) { + if ((f2fs_encrypted_inode(dir) || dummy_encrypt) && + f2fs_may_encrypt(inode)) { err = fscrypt_inherit_context(dir, inode, page, false); if (err) goto put_error; @@ -402,8 +398,6 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, page = get_node_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; - - 
set_cold_node(inode, page); } if (new_name) { @@ -547,13 +541,12 @@ start: if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - dentry_blk = kmap(dentry_page); + dentry_blk = page_address(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, slots, NR_DENTRY_IN_BLOCK); if (bit_pos < NR_DENTRY_IN_BLOCK) goto add_dentry; - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } @@ -588,7 +581,6 @@ fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); return err; @@ -642,7 +634,6 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, F2FS_I(dir)->task = NULL; } if (de) { - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); err = -EEXIST; } else if (IS_ERR(page)) { @@ -713,7 +704,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); - add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); + if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT) + add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); @@ -730,7 +722,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, 0); - kunmap(page); /* kunmap - pair of f2fs_find_entry */ set_page_dirty(page); dir->i_ctime = dir->i_mtime = current_time(dir); @@ -741,10 +732,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); 
clear_page_dirty_for_io(page); ClearPagePrivate(page); @@ -775,7 +766,7 @@ bool f2fs_empty_dir(struct inode *dir) return false; } - dentry_blk = kmap_atomic(dentry_page); + dentry_blk = page_address(dentry_page); if (bidx == 0) bit_pos = 2; else @@ -783,7 +774,6 @@ bool f2fs_empty_dir(struct inode *dir) bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); - kunmap_atomic(dentry_blk); f2fs_put_page(dentry_page, 1); @@ -901,19 +891,17 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } } - dentry_blk = kmap(dentry_page); + dentry_blk = page_address(dentry_page); make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); if (err) { - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; } - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } out_free: diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index ff2352a0ed15..d5a861bf2b42 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -460,7 +460,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, struct rb_node *insert_parent) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct rb_node **p = &et->root.rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -706,6 +706,9 @@ void f2fs_drop_extent_tree(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree; + if (!f2fs_may_extent_tree(inode)) + return; + set_inode_flag(inode, FI_NO_EXTENT); write_lock(&et->lock); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6300ac5bcbe4..1df7f10476d6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -98,9 +98,10 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define F2FS_MOUNT_RESERVE_ROOT 0x01000000 -#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) 
((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) +#define F2FS_OPTION(sbi) ((sbi)->mount_opt) +#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) (F2FS_OPTION(sbi).opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) (F2FS_OPTION(sbi).opt & F2FS_MOUNT_##option) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -113,7 +114,26 @@ typedef u32 block_t; /* typedef u32 nid_t; struct f2fs_mount_info { - unsigned int opt; + unsigned int opt; + int write_io_size_bits; /* Write IO size bits */ + block_t root_reserved_blocks; /* root reserved blocks */ + kuid_t s_resuid; /* reserved blocks for uid */ + kgid_t s_resgid; /* reserved blocks for gid */ + int active_logs; /* # of active logs */ + int inline_xattr_size; /* inline xattr size */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; /* For fault injection */ +#endif +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + /* For which write hints are passed down to block layer */ + int whint_mode; + int alloc_mode; /* segment allocation policy */ + int fsync_mode; /* fsync policy */ + bool test_dummy_encryption; /* test dummy encryption */ }; #define F2FS_FEATURE_ENCRYPT 0x0001 @@ -125,6 +145,8 @@ struct f2fs_mount_info { #define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 #define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_FEATURE_INODE_CRTIME 0x0100 +#define F2FS_FEATURE_LOST_FOUND 0x0200 +#define F2FS_FEATURE_VERITY 0x0400 /* reserved */ #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -450,7 +472,7 @@ static inline void make_dentry_ptr_block(struct inode *inode, d->inode = inode; d->max = NR_DENTRY_IN_BLOCK; d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; - d->bitmap 
= &t->dentry_bitmap; + d->bitmap = t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; } @@ -576,6 +598,8 @@ enum { #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 #define FADVISE_KEEP_SIZE_BIT 0x10 +#define FADVISE_HOT_BIT 0x20 +#define FADVISE_VERITY_BIT 0x40 /* reserved */ #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -590,6 +614,9 @@ enum { #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) #define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) +#define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) +#define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) #define DEF_DIR_LEVEL 0 @@ -637,6 +664,7 @@ struct f2fs_inode_info { kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ struct timespec i_crtime; /* inode creation time */ + struct timespec i_disk_time[4]; /* inode disk times */ }; static inline void get_extent_info(struct extent_info *ext, @@ -743,7 +771,7 @@ struct f2fs_nm_info { unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ - unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; + unsigned char **free_nid_bitmap; unsigned char *nat_block_bitmap; unsigned short *free_nid_count; /* free nid count of NAT block */ @@ -976,6 +1004,7 @@ struct f2fs_io_info { bool submitted; /* indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ + bool is_meta; /* indicate borrow meta inode mapping or not */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ }; @@ -1037,10 +1066,34 
@@ enum { MAX_TIME, }; +enum { + WHINT_MODE_OFF, /* not pass down write hints */ + WHINT_MODE_USER, /* try to pass down hints given by users */ + WHINT_MODE_FS, /* pass down hints with F2FS policy */ +}; + +enum { + ALLOC_MODE_DEFAULT, /* stay default */ + ALLOC_MODE_REUSE, /* reuse segments as much as possible */ +}; + +enum fsync_mode { + FSYNC_MODE_POSIX, /* fsync follows posix semantics */ + FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ +}; + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) \ + (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ + struct rw_semaphore sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ @@ -1060,7 +1113,6 @@ struct f2fs_sb_info { struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; /* bio ordering for NODE/DATA */ - int write_io_size_bits; /* Write IO size bits */ mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ @@ -1110,9 +1162,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ loff_t max_file_blocks; /* max block index of file */ - int active_logs; /* # of active logs */ int dir_level; /* directory level */ - int inline_xattr_size; /* inline xattr size */ unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ int readdir_ra; /* readahead inode in readdir */ @@ -1122,9 +1172,6 @@ struct f2fs_sb_info { block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ 
- block_t root_reserved_blocks; /* root reserved blocks */ - kuid_t s_resuid; /* reserved blocks for uid */ - kgid_t s_resgid; /* reserved blocks for gid */ unsigned int nquota_files; /* # of quota sysfile */ @@ -1209,17 +1256,6 @@ struct f2fs_sb_info { /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_chksum_seed; - - /* For fault injection */ -#ifdef CONFIG_F2FS_FAULT_INJECTION - struct f2fs_fault_info fault_info; -#endif - -#ifdef CONFIG_QUOTA - /* Names of quota files with journalled quota */ - char *s_qf_names[MAXQUOTAS]; - int s_jquota_fmt; /* Format of quota to use */ -#endif }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -1229,7 +1265,7 @@ struct f2fs_sb_info { __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { - struct f2fs_fault_info *ffi = &sbi->fault_info; + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (!ffi->inject_rate) return false; @@ -1586,12 +1622,12 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, return false; if (IS_NOQUOTA(inode)) return true; - if (capable(CAP_SYS_RESOURCE)) + if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) return true; - if (uid_eq(sbi->s_resuid, current_fsuid())) + if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) && + in_group_p(F2FS_OPTION(sbi).s_resgid)) return true; - if (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && - in_group_p(sbi->s_resgid)) + if (capable(CAP_SYS_RESOURCE)) return true; return false; } @@ -1627,7 +1663,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, sbi->current_reserved_blocks; if (!__allow_reserved_blocks(sbi, inode)) - avail_user_block_count -= sbi->root_reserved_blocks; + avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; @@ -1762,6 +1798,12 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, 
int flag) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); int offset; + if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { + offset = (flag == SIT_BITMAP) ? + le32_to_cpu(ckpt->nat_ver_bitmap_bytesize) : 0; + return &ckpt->sit_nat_version_bitmap + offset; + } + if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) return &ckpt->sit_nat_version_bitmap; @@ -1828,7 +1870,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, sbi->current_reserved_blocks + 1; if (!__allow_reserved_blocks(sbi, inode)) - valid_block_count += sbi->root_reserved_blocks; + valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); @@ -2399,12 +2441,6 @@ static inline int f2fs_has_inline_dentry(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DENTRY); } -static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) -{ - if (!f2fs_has_inline_dentry(dir)) - kunmap(page); -} - static inline int is_file(struct inode *inode, int type) { return F2FS_I(inode)->i_advise & type; @@ -2436,7 +2472,17 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) } if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || file_keep_isize(inode) || - i_size_read(inode) & PAGE_MASK) + i_size_read(inode) & ~PAGE_MASK) + return false; + + if (!timespec_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 3, + &F2FS_I(inode)->i_crtime)) return false; down_read(&F2FS_I(inode)->i_sem); @@ -2446,9 +2492,9 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) return ret; } -static inline int f2fs_readonly(struct super_block *sb) +static inline bool f2fs_readonly(struct super_block *sb) { - return sb->s_flags & 
SB_RDONLY; + return sb_rdonly(sb); } static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) @@ -2596,6 +2642,8 @@ void handle_failed_inode(struct inode *inode); /* * namei.c */ +int update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); /* @@ -2768,6 +2816,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); int rw_hint_to_seg_type(enum rw_hint hint); +enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, + enum temp_type temp); /* * checkpoint.c @@ -2850,6 +2900,7 @@ int f2fs_release_page(struct page *page, gfp_t wait); int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); #endif +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); /* * gc.c @@ -3172,45 +3223,21 @@ static inline bool f2fs_bio_encrypted(struct bio *bio) return bio->bi_private != NULL; } -static inline int f2fs_sb_has_crypto(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); -} - -static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); -} - -static inline int f2fs_sb_has_extra_attr(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); -} - -static inline int f2fs_sb_has_project_quota(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); -} - -static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); -} - -static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); -} - -static inline int f2fs_sb_has_quota_ino(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO); +#define 
F2FS_FEATURE_FUNCS(name, flagname) \ +static inline int f2fs_sb_has_##name(struct super_block *sb) \ +{ \ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_##flagname); \ } -static inline int f2fs_sb_has_inode_crtime(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CRTIME); -} +F2FS_FEATURE_FUNCS(encrypt, ENCRYPT); +F2FS_FEATURE_FUNCS(blkzoned, BLKZONED); +F2FS_FEATURE_FUNCS(extra_attr, EXTRA_ATTR); +F2FS_FEATURE_FUNCS(project_quota, PRJQUOTA); +F2FS_FEATURE_FUNCS(inode_chksum, INODE_CHKSUM); +F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); +F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); +F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, @@ -3230,7 +3257,7 @@ static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) { struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); - return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb); + return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) @@ -3259,4 +3286,11 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } +static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) +{ + return (f2fs_encrypted_file(inode) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 672a542e5464..6b94f19b3fa8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -163,9 +163,10 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) cp_reason = CP_NODE_NEED_CP; else if (test_opt(sbi, FASTBOOT)) cp_reason = CP_FASTBOOT_MODE; - else if (sbi->active_logs == 2) + else if (F2FS_OPTION(sbi).active_logs == 2) cp_reason = CP_SPEC_LOG_NUM; - else if (need_dentry_mark(sbi, inode->i_ino) && + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT && + 
need_dentry_mark(sbi, inode->i_ino) && exist_written_data(sbi, F2FS_I(inode)->i_pino, TRANS_DIR_INO)) cp_reason = CP_RECOVER_DIR; @@ -479,6 +480,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; + + filp->f_mode |= FMODE_NOWAIT; + return dquot_file_open(inode, filp); } @@ -569,7 +573,6 @@ truncate_out: int truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; int count = 0, err = 0; @@ -578,7 +581,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) trace_f2fs_truncate_blocks_enter(inode, from); - free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); + free_from = (pgoff_t)F2FS_BLK_ALIGN(from); if (free_from >= sbi->max_file_blocks) goto free_partial; @@ -1348,8 +1351,12 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } out: - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) - f2fs_i_size_write(inode, new_size); + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } out_sem: up_write(&F2FS_I(inode)->i_mmap_sem); @@ -1711,6 +1718,8 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + if (f2fs_is_volatile_file(inode)) goto err_out; @@ -1729,6 +1738,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1938,7 +1948,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); - if (!f2fs_sb_has_crypto(inode->i_sb)) + if (!f2fs_sb_has_encrypt(inode->i_sb)) return -EOPNOTSUPP; 
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -1948,7 +1958,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - if (!f2fs_sb_has_crypto(file_inode(filp)->i_sb)) + if (!f2fs_sb_has_encrypt(file_inode(filp)->i_sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } @@ -1959,16 +1969,18 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; - if (!f2fs_sb_has_crypto(inode->i_sb)) + if (!f2fs_sb_has_encrypt(inode->i_sb)) return -EOPNOTSUPP; - if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) - goto got_it; - err = mnt_want_write_file(filp); if (err) return err; + down_write(&sbi->sb_lock); + + if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) + goto got_it; + /* update superblock with uuid */ generate_random_uuid(sbi->raw_super->encrypt_pw_salt); @@ -1976,15 +1988,16 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) if (err) { /* undo new data */ memset(sbi->raw_super->encrypt_pw_salt, 0, 16); - mnt_drop_write_file(filp); - return err; + goto out_err; } - mnt_drop_write_file(filp); got_it: if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, 16)) - return -EFAULT; - return 0; + err = -EFAULT; +out_err: + up_write(&sbi->sb_lock); + mnt_drop_write_file(filp); + return err; } static int f2fs_ioc_gc(struct file *filp, unsigned long arg) @@ -2045,8 +2058,10 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) return ret; end = range.start + range.len; - if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) - return -EINVAL; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) { + ret = -EINVAL; + goto out; + } do_more: if (!range.sync) { if (!mutex_trylock(&sbi->gc_mutex)) { @@ -2885,25 +2900,54 @@ static ssize_t f2fs_file_write_iter(struct kiocb 
*iocb, struct iov_iter *from) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; - inode_lock(inode); + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + ret = generic_write_checks(iocb, from); if (ret > 0) { + bool preallocated = false; + size_t target_size = 0; int err; if (iov_iter_fault_in_readable(from, iov_iter_count(from))) set_inode_flag(inode, FI_NO_PREALLOC); - err = f2fs_preallocate_blocks(iocb, from); - if (err) { - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - return err; + if ((iocb->ki_flags & IOCB_NOWAIT) && + (iocb->ki_flags & IOCB_DIRECT)) { + if (!f2fs_overwrite_io(inode, iocb->ki_pos, + iov_iter_count(from)) || + f2fs_has_inline_data(inode) || + f2fs_force_buffered_io(inode, WRITE)) { + inode_unlock(inode); + return -EAGAIN; + } + + } else { + preallocated = true; + target_size = iocb->ki_pos + iov_iter_count(from); + + err = f2fs_preallocate_blocks(iocb, from); + if (err) { + clear_inode_flag(inode, FI_NO_PREALLOC); + inode_unlock(inode); + return err; + } } blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); clear_inode_flag(inode, FI_NO_PREALLOC); + /* if we couldn't write data, we should deallocate blocks. */ + if (preallocated && i_size_read(inode) < target_size) + f2fs_truncate(inode); + if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index aa720cc44509..9327411fd93b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -76,14 +76,15 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. 
*/ - if (!mutex_trylock(&sbi->gc_mutex)) - goto next; - if (gc_th->gc_urgent) { wait_ms = gc_th->urgent_sleep_time; + mutex_lock(&sbi->gc_mutex); goto do_gc; } + if (!mutex_trylock(&sbi->gc_mutex)) + goto next; + if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); @@ -161,12 +162,17 @@ static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type) { int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY; - if (gc_th && gc_th->gc_idle) { + if (!gc_th) + return gc_mode; + + if (gc_th->gc_idle) { if (gc_th->gc_idle == 1) gc_mode = GC_CB; else if (gc_th->gc_idle == 2) gc_mode = GC_GREEDY; } + if (gc_th->gc_urgent) + gc_mode = GC_GREEDY; return gc_mode; } @@ -188,11 +194,14 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, } /* we need to check every dirty segments in the FG_GC case */ - if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) + if (gc_type != FG_GC && + (sbi->gc_thread && !sbi->gc_thread->gc_urgent) && + p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - /* let's select beginning hot/small space first */ - if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + /* let's select beginning hot/small space first in no_heap mode*/ + if (test_opt(sbi, NOHEAP) && + (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; else p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; @@ -1006,7 +1015,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), - .iroot = RADIX_TREE_INIT(GFP_NOFS), + .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; trace_f2fs_gc_begin(sbi->sb, sync, background, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 90e38d8ea688..265da200daa8 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -226,10 +226,10 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - 
spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); @@ -369,7 +369,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, f2fs_wait_on_page_writeback(page, DATA, true); zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); - dentry_blk = kmap_atomic(page); + dentry_blk = page_address(page); make_dentry_ptr_inline(dir, &src, inline_dentry); make_dentry_ptr_block(dir, &dst, dentry_blk); @@ -386,7 +386,6 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); - kunmap_atomic(dentry_blk); if (!PageUptodate(page)) SetPageUptodate(page); set_page_dirty(page); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 205add3d0f3a..e0d9e8f27ed2 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -284,6 +284,10 @@ static int do_read_inode(struct inode *inode) fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); } + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -328,7 +332,7 @@ make_now: inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + inode_nohighmem(inode); } else if (S_ISLNK(inode->i_mode)) { if (f2fs_encrypted_inode(inode)) inode->i_op = &f2fs_encrypted_symlink_inode_operations; @@ 
-439,12 +443,15 @@ void update_inode(struct inode *inode, struct page *node_page) } __set_inode_rdev(inode, ri); - set_cold_node(inode, node_page); /* deleted inode */ if (inode->i_nlink == 0) clear_inline_node(node_page); + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; } void update_inode_page(struct inode *inode) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b68e7b03959f..d5098efe577c 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -78,7 +78,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); /* If the directory encrypted, then we should encrypt the inode. */ - if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) + if ((f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && + f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); if (f2fs_sb_has_extra_attr(sbi->sb)) { @@ -97,7 +98,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); if (f2fs_has_inline_xattr(inode)) - xattr_size = sbi->inline_xattr_size; + xattr_size = F2FS_OPTION(sbi).inline_xattr_size; /* Otherwise, will be 0 */ } else if (f2fs_has_inline_xattr(inode) || f2fs_has_inline_dentry(inode)) { @@ -142,7 +143,7 @@ fail_drop: return ERR_PTR(err); } -static int is_multimedia_file(const unsigned char *s, const char *sub) +static int is_extension_exist(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); @@ -168,19 +169,94 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) /* * Set multimedia files as cold files for hot/cold data separation */ -static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, +static inline void set_file_temperature(struct 
f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) { - int i; - __u8 (*extlist)[8] = sbi->raw_super->extension_list; + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, cold_count, hot_count; + + down_read(&sbi->sb_lock); + + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; - int count = le32_to_cpu(sbi->raw_super->extension_count); - for (i = 0; i < count; i++) { - if (is_multimedia_file(name, extlist[i])) { + for (i = 0; i < cold_count + hot_count; i++) { + if (!is_extension_exist(name, extlist[i])) + continue; + if (i < cold_count) file_set_cold(inode); - break; - } + else + file_set_hot(inode); + break; } + + up_read(&sbi->sb_lock); +} + +int update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int total_count = cold_count + hot_count; + int start, count; + int i; + + if (set) { + if (total_count == F2FS_MAX_EXTENSION) + return -EINVAL; + } else { + if (!hot && !cold_count) + return -EINVAL; + if (hot && !hot_count) + return -EINVAL; + } + + if (hot) { + start = cold_count; + count = total_count; + } else { + start = 0; + count = cold_count; + } + + for (i = start; i < count; i++) { + if (strcmp(name, extlist[i])) + continue; + + if (set) + return -EINVAL; + + memcpy(extlist[i], extlist[i + 1], + F2FS_EXTENSION_LEN * (total_count - i - 1)); + memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); + if (hot) + sbi->raw_super->hot_ext_count = hot_count - 1; + else + sbi->raw_super->extension_count = + cpu_to_le32(cold_count - 1); + return 0; + } + + if (!set) + return -EINVAL; + + if (hot) { + strncpy(extlist[count], name, strlen(name)); + sbi->raw_super->hot_ext_count = hot_count + 1; + } else { + char 
buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; + + memcpy(buf, &extlist[cold_count], + F2FS_EXTENSION_LEN * hot_count); + memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); + strncpy(extlist[cold_count], name, strlen(name)); + memcpy(&extlist[cold_count + 1], buf, + F2FS_EXTENSION_LEN * hot_count); + sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); + } + return 0; } static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, @@ -203,7 +279,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return PTR_ERR(inode); if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_cold_files(sbi, inode, dentry->d_name.name); + set_file_temperature(sbi, inode, dentry->d_name.name); inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; @@ -317,7 +393,6 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) de = f2fs_find_entry(dir, &dot, &page); if (de) { - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); } else if (IS_ERR(page)) { err = PTR_ERR(page); @@ -329,14 +404,12 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) } de = f2fs_find_entry(dir, &dotdot, &page); - if (de) { - f2fs_dentry_kunmap(dir, page); + if (de) f2fs_put_page(page, 0); - } else if (IS_ERR(page)) { + else if (IS_ERR(page)) err = PTR_ERR(page); - } else { + else err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); - } out: if (!err) clear_inode_flag(dir, FI_INLINE_DOTS); @@ -377,7 +450,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } ino = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); inode = f2fs_iget(dir->i_sb, ino); @@ -452,7 +524,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) err = acquire_orphan_inode(sbi); if (err) { f2fs_unlock_op(sbi); - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); goto fail; } @@ -579,7 +650,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, 
umode_t mode) inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + inode_nohighmem(inode); set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); @@ -717,10 +788,12 @@ out: static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { - if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if (f2fs_encrypted_inode(dir)) { + if (f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { int err = fscrypt_get_encryption_info(dir); if (err) return err; @@ -893,16 +966,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (old_dir_entry) { - if (old_dir != new_dir && !whiteout) { + if (old_dir != new_dir && !whiteout) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - } else { - f2fs_dentry_kunmap(old_inode, old_dir_page); + else f2fs_put_page(old_dir_page, 0); - } f2fs_i_links_write(old_dir, false); } - add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); f2fs_unlock_op(sbi); @@ -912,20 +984,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, put_out_dir: f2fs_unlock_op(sbi); - if (new_page) { - f2fs_dentry_kunmap(new_dir, new_page); + if (new_page) f2fs_put_page(new_page, 0); - } out_whiteout: if (whiteout) iput(whiteout); out_dir: - if (old_dir_entry) { - f2fs_dentry_kunmap(old_inode, old_dir_page); + if (old_dir_entry) f2fs_put_page(old_dir_page, 0); - } out_old: - f2fs_dentry_kunmap(old_dir, old_page); f2fs_put_page(old_page, 0); out: return err; @@ -1057,8 +1124,10 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_mark_inode_dirty_sync(new_dir, false); - add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); - 
add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { + add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + } f2fs_unlock_op(sbi); @@ -1067,19 +1136,15 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; out_new_dir: if (new_dir_entry) { - f2fs_dentry_kunmap(new_inode, new_dir_page); f2fs_put_page(new_dir_page, 0); } out_old_dir: if (old_dir_entry) { - f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } out_new: - f2fs_dentry_kunmap(new_dir, new_page); f2fs_put_page(new_page, 0); out_old: - f2fs_dentry_kunmap(old_dir, old_page); f2fs_put_page(old_page, 0); out: return err; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 177c438e4a56..f202398e20ea 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -91,11 +91,11 @@ static void clear_node_page_dirty(struct page *page) unsigned int long flags; if (PageDirty(page)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, + xa_lock_irqsave(&mapping->i_pages, flags); + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); clear_page_dirty_for_io(page); dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); @@ -193,8 +193,8 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) __free_nat_entry(e); } -static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) +static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) { nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); struct nat_entry_set *head; @@ -209,15 +209,36 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head->entry_cnt = 0; f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); } + return head; +} + +static void 
__set_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + struct nat_entry_set *head; + bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR; + + if (!new_ne) + head = __grab_nat_entry_set(nm_i, ne); + + /* + * update entry_cnt in below condition: + * 1. update NEW_ADDR to valid block address; + * 2. update old block address to new one; + */ + if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) || + !get_nat_flag(ne, IS_DIRTY))) + head->entry_cnt++; + + set_nat_flag(ne, IS_PREALLOC, new_ne); if (get_nat_flag(ne, IS_DIRTY)) goto refresh_list; nm_i->dirty_nat_cnt++; - head->entry_cnt++; set_nat_flag(ne, IS_DIRTY, true); refresh_list: - if (nat_get_blkaddr(ne) == NEW_ADDR) + if (new_ne) list_del_init(&ne->list); else list_move_tail(&ne->list, &head->entry_list); @@ -1076,7 +1097,7 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) f2fs_wait_on_page_writeback(page, NODE, true); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); - set_cold_node(dn->inode, page); + set_cold_node(page, S_ISDIR(dn->inode->i_mode)); if (!PageUptodate(page)) SetPageUptodate(page); if (set_page_dirty(page)) @@ -1140,7 +1161,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(sbi, check_nid_range(sbi, nid)); rcu_read_lock(); - apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid); + apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid); rcu_read_unlock(); if (apage) return; @@ -2291,6 +2312,7 @@ retry: if (!PageUptodate(ipage)) SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); + set_cold_node(page, false); src = F2FS_INODE(page); dst = F2FS_INODE(ipage); @@ -2580,8 +2602,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) if (!enabled_nat_bits(sbi, NULL)) return 0; - nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + - F2FS_BLKSIZE - 1); + nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); nm_i->nat_bits = f2fs_kzalloc(sbi, nm_i->nat_bits_blocks << 
F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) @@ -2707,12 +2728,20 @@ static int init_node_manager(struct f2fs_sb_info *sbi) static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); + int i; - nm_i->free_nid_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks * - NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); + nm_i->free_nid_bitmap = f2fs_kzalloc(sbi, nm_i->nat_blocks * + sizeof(unsigned char *), GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; + for (i = 0; i < nm_i->nat_blocks; i++) { + nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi, + NAT_ENTRY_BITMAP_SIZE_ALIGNED, GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + } + nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8, GFP_KERNEL); if (!nm_i->nat_block_bitmap) @@ -2803,7 +2832,13 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) up_write(&nm_i->nat_tree_lock); kvfree(nm_i->nat_block_bitmap); - kvfree(nm_i->free_nid_bitmap); + if (nm_i->free_nid_bitmap) { + int i; + + for (i = 0; i < nm_i->nat_blocks; i++) + kvfree(nm_i->free_nid_bitmap[i]); + kfree(nm_i->free_nid_bitmap); + } kvfree(nm_i->free_nid_count); kfree(nm_i->nat_bitmap); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 081ef0d672bf..b95e49e4a928 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -44,6 +44,7 @@ enum { HAS_FSYNCED_INODE, /* is the inode fsynced before? */ HAS_LAST_FSYNC, /* has the latest node fsync mark? */ IS_DIRTY, /* this nat entry is dirty? 
*/ + IS_PREALLOC, /* nat entry is preallocated */ }; /* @@ -422,12 +423,12 @@ static inline void clear_inline_node(struct page *page) ClearPageChecked(page); } -static inline void set_cold_node(struct inode *inode, struct page *page) +static inline void set_cold_node(struct page *page, bool is_dir) { struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); - if (S_ISDIR(inode->i_mode)) + if (is_dir) flag &= ~(0x1 << COLD_BIT_SHIFT); else flag |= (0x1 << COLD_BIT_SHIFT); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 337f3363f48f..1b23d3febe4c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -144,7 +144,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage, retry: de = __f2fs_find_entry(dir, &fname, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) - goto out_unmap_put; + goto out_put; if (de) { einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); @@ -153,19 +153,19 @@ retry: err = PTR_ERR(einode); if (err == -ENOENT) err = -EEXIST; - goto out_unmap_put; + goto out_put; } err = dquot_initialize(einode); if (err) { iput(einode); - goto out_unmap_put; + goto out_put; } err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); - goto out_unmap_put; + goto out_put; } f2fs_delete_entry(de, page, dir, einode); iput(einode); @@ -180,8 +180,7 @@ retry: goto retry; goto out; -out_unmap_put: - f2fs_dentry_kunmap(dir, page); +out_put: f2fs_put_page(page, 0); out: if (file_enc_name(inode)) @@ -243,6 +242,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, struct curseg_info *curseg; struct page *page = NULL; block_t blkaddr; + unsigned int loop_cnt = 0; + unsigned int free_blocks = sbi->user_block_count - + valid_user_blocks(sbi); int err = 0; /* get node pages in the current segment */ @@ -295,6 +297,17 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, if (IS_INODE(page) && is_dent_dnode(page)) entry->last_dentry = 
blkaddr; next: + /* sanity check in order to detect looped node chain */ + if (++loop_cnt >= free_blocks || + blkaddr == next_blkaddr_of_node(page)) { + f2fs_msg(sbi->sb, KERN_NOTICE, + "%s: detect looped node chain, " + "blkaddr:%u, next:%u", + __func__, blkaddr, next_blkaddr_of_node(page)); + err = -EINVAL; + break; + } + /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b16a8e6625aa..5854cc4e1d67 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1411,12 +1411,11 @@ static int issue_discard_thread(void *data) if (kthread_should_stop()) return 0; - if (dcc->discard_wake) { + if (dcc->discard_wake) dcc->discard_wake = 0; - if (sbi->gc_thread && sbi->gc_thread->gc_urgent) - init_discard_policy(&dpolicy, - DPOLICY_FORCE, 1); - } + + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + init_discard_policy(&dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); @@ -1485,7 +1484,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { #ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sbi->sb) && + if (f2fs_sb_has_blkzoned(sbi->sb) && bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif @@ -1683,7 +1682,7 @@ find_next: sbi->blocks_per_seg, cur_pos); len = next_pos - cur_pos; - if (f2fs_sb_mounted_blkzoned(sbi->sb) || + if (f2fs_sb_has_blkzoned(sbi->sb) || (force && len < cpc->trim_minlen)) goto skip; @@ -1727,7 +1726,7 @@ void init_discard_policy(struct discard_policy *dpolicy, } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = true; + dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { @@ -1863,7 +1862,7 @@ static void 
update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) sbi->discard_blks--; /* don't overwrite by SSR to keep node chain */ - if (se->type == CURSEG_WARM_NODE) { + if (IS_NODESEG(se->type)) { if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks++; } @@ -2164,11 +2163,17 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (sbi->segs_per_sec != 1) return CURSEG_I(sbi, type)->segno; - if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + if (test_opt(sbi, NOHEAP) && + (type == CURSEG_HOT_DATA || IS_NODESEG(type))) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + + /* find segments from 0 to reuse freed segments */ + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) + return 0; + return CURSEG_I(sbi, type)->segno; } @@ -2455,6 +2460,101 @@ int rw_hint_to_seg_type(enum rw_hint hint) } } +/* This returns write hints for each segment type. This hints will be + * passed down to block layer. There are mapping tables which depend on + * the mount option 'whint_mode'. + * + * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. + * + * 2) whint_mode=user-based. F2FS tries to pass down hints given by users. + * + * User F2FS Block + * ---- ---- ----- + * META WRITE_LIFE_NOT_SET + * HOT_NODE " + * WARM_NODE " + * COLD_NODE " + * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME + * extension list " " + * + * -- buffered io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " " + * WRITE_LIFE_MEDIUM " " + * WRITE_LIFE_LONG " " + * + * -- direct io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " WRITE_LIFE_NONE + * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM + * WRITE_LIFE_LONG " WRITE_LIFE_LONG + * + * 3) whint_mode=fs-based. 
F2FS passes down hints with its policy. + * + * User F2FS Block + * ---- ---- ----- + * META WRITE_LIFE_MEDIUM; + * HOT_NODE WRITE_LIFE_NOT_SET + * WARM_NODE " + * COLD_NODE WRITE_LIFE_NONE + * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME + * extension list " " + * + * -- buffered io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG + * WRITE_LIFE_NONE " " + * WRITE_LIFE_MEDIUM " " + * WRITE_LIFE_LONG " " + * + * -- direct io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " WRITE_LIFE_NONE + * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM + * WRITE_LIFE_LONG " WRITE_LIFE_LONG + */ + +enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) +{ + if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { + if (type == DATA) { + if (temp == WARM) + return WRITE_LIFE_NOT_SET; + else if (temp == HOT) + return WRITE_LIFE_SHORT; + else if (temp == COLD) + return WRITE_LIFE_EXTREME; + } else { + return WRITE_LIFE_NOT_SET; + } + } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) { + if (type == DATA) { + if (temp == WARM) + return WRITE_LIFE_LONG; + else if (temp == HOT) + return WRITE_LIFE_SHORT; + else if (temp == COLD) + return WRITE_LIFE_EXTREME; + } else if (type == NODE) { + if (temp == WARM || temp == HOT) + return WRITE_LIFE_NOT_SET; + else if (temp == COLD) + return WRITE_LIFE_NONE; + } else if (type == META) { + return WRITE_LIFE_MEDIUM; + } + } + return WRITE_LIFE_NOT_SET; +} + static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) @@ -2487,7 +2587,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - if (is_inode_flag_set(inode, FI_HOT_DATA)) + if (file_is_hot(inode) || + is_inode_flag_set(inode, 
FI_HOT_DATA)) return CURSEG_HOT_DATA; return rw_hint_to_seg_type(inode->i_write_hint); } else { @@ -2502,7 +2603,7 @@ static int __get_segment_type(struct f2fs_io_info *fio) { int type = 0; - switch (fio->sbi->active_logs) { + switch (F2FS_OPTION(fio->sbi).active_logs) { case 2: type = __get_segment_type_2(fio); break; @@ -2642,6 +2743,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = META, + .temp = HOT, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = page->index, @@ -2688,8 +2790,15 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) int rewrite_data_page(struct f2fs_io_info *fio) { int err; + struct f2fs_sb_info *sbi = fio->sbi; fio->new_blkaddr = fio->old_blkaddr; + /* i/o temperature is needed for passing down write hints */ + __get_segment_type(fio); + + f2fs_bug_on(sbi, !IS_DATASEG(get_seg_entry(sbi, + GET_SEGNO(sbi, fio->new_blkaddr))->type)); + stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index f11c4bc82c78..3325d0769723 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -53,13 +53,19 @@ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ (sbi)->segs_per_sec)) \ -#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) -#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) +#define MAIN_BLKADDR(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr)) +#define SEG0_BLKADDR(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->seg0_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment0_blkaddr)) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) #define MAIN_SECS(sbi) ((sbi)->total_sections) -#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) +#define TOTAL_SEGS(sbi) \ + (SM_I(sbi) ? 
SM_I(sbi)->segment_count : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count)) #define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) @@ -596,6 +602,8 @@ static inline int utilization(struct f2fs_sb_info *sbi) #define DEF_MIN_FSYNC_BLOCKS 8 #define DEF_MIN_HOT_BLOCKS 16 +#define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */ + enum { F2FS_IPU_FORCE, F2FS_IPU_SSR, @@ -630,10 +638,17 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1); } -static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) +static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr) { - BUG_ON(blk_addr < SEG0_BLKADDR(sbi) - || blk_addr >= MAX_BLKADDR(sbi)); + struct f2fs_sb_info *sbi = fio->sbi; + + if (PAGE_TYPE_OF_BIO(fio->type) == META && + (!is_read_io(fio->op) || fio->is_meta)) + BUG_ON(blk_addr < SEG0_BLKADDR(sbi) || + blk_addr >= MAIN_BLKADDR(sbi)); + else + BUG_ON(blk_addr < MAIN_BLKADDR(sbi) || + blk_addr >= MAX_BLKADDR(sbi)); } /* diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8173ae688814..42d564c5ccd0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -60,7 +60,7 @@ char *fault_name[FAULT_MAX] = { static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate) { - struct f2fs_fault_info *ffi = &sbi->fault_info; + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (rate) { atomic_set(&ffi->inject_ops, 0); @@ -129,6 +129,10 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, + Opt_whint, + Opt_alloc, + Opt_fsync, + Opt_test_dummy_encryption, Opt_err, }; @@ -182,6 +186,10 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_whint, "whint_mode=%s"}, + {Opt_alloc, "alloc_mode=%s"}, + {Opt_fsync, "fsync_mode=%s"}, + {Opt_test_dummy_encryption, 
"test_dummy_encryption"}, {Opt_err, NULL}, }; @@ -202,21 +210,24 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi) block_t limit = (sbi->user_block_count << 1) / 1000; /* limit is 0.2% */ - if (test_opt(sbi, RESERVE_ROOT) && sbi->root_reserved_blocks > limit) { - sbi->root_reserved_blocks = limit; + if (test_opt(sbi, RESERVE_ROOT) && + F2FS_OPTION(sbi).root_reserved_blocks > limit) { + F2FS_OPTION(sbi).root_reserved_blocks = limit; f2fs_msg(sbi->sb, KERN_INFO, "Reduce reserved blocks for root = %u", - sbi->root_reserved_blocks); + F2FS_OPTION(sbi).root_reserved_blocks); } if (!test_opt(sbi, RESERVE_ROOT) && - (!uid_eq(sbi->s_resuid, + (!uid_eq(F2FS_OPTION(sbi).s_resuid, make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || - !gid_eq(sbi->s_resgid, + !gid_eq(F2FS_OPTION(sbi).s_resgid, make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) f2fs_msg(sbi->sb, KERN_INFO, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", - from_kuid_munged(&init_user_ns, sbi->s_resuid), - from_kgid_munged(&init_user_ns, sbi->s_resgid)); + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); } static void init_once(void *foo) @@ -236,7 +247,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, char *qname; int ret = -EINVAL; - if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_msg(sb, KERN_ERR, "Cannot change journaled " "quota options when quota turned on"); @@ -254,8 +265,8 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "Not enough memory for storing quotafile name"); return -EINVAL; } - if (sbi->s_qf_names[qtype]) { - if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + if (F2FS_OPTION(sbi).s_qf_names[qtype]) { + if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) ret = 0; else f2fs_msg(sb, KERN_ERR, @@ -268,7 +279,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "quotafile 
must be on filesystem root"); goto errout; } - sbi->s_qf_names[qtype] = qname; + F2FS_OPTION(sbi).s_qf_names[qtype] = qname; set_opt(sbi, QUOTA); return 0; errout: @@ -280,13 +291,13 @@ static int f2fs_clear_qf_name(struct super_block *sb, int qtype) { struct f2fs_sb_info *sbi = F2FS_SB(sb); - if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" " when quota turned on"); return -EINVAL; } - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); + F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; return 0; } @@ -302,15 +313,19 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) "Cannot enable project quota enforcement."); return -1; } - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || - sbi->s_qf_names[PRJQUOTA]) { - if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && + F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) clear_opt(sbi, USRQUOTA); - if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sbi, GRPQUOTA) && + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) clear_opt(sbi, GRPQUOTA); - if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + if (test_opt(sbi, PRJQUOTA) && + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) clear_opt(sbi, PRJQUOTA); if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || @@ -320,19 +335,19 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) return -1; } - if (!sbi->s_jquota_fmt) { + if (!F2FS_OPTION(sbi).s_jquota_fmt) { f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " "not specified"); return -1; } } - if (f2fs_sb_has_quota_ino(sbi->sb) && sbi->s_jquota_fmt) { + if (f2fs_sb_has_quota_ino(sbi->sb) && F2FS_OPTION(sbi).s_jquota_fmt) { 
f2fs_msg(sbi->sb, KERN_INFO, "QUOTA feature is enabled, so ignore jquota_fmt"); - sbi->s_jquota_fmt = 0; + F2FS_OPTION(sbi).s_jquota_fmt = 0; } - if (f2fs_sb_has_quota_ino(sbi->sb) && sb_rdonly(sbi->sb)) { + if (f2fs_sb_has_quota_ino(sbi->sb) && f2fs_readonly(sbi->sb)) { f2fs_msg(sbi->sb, KERN_INFO, "Filesystem with quota feature cannot be mounted RDWR " "without CONFIG_QUOTA"); @@ -403,14 +418,14 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else if (!f2fs_sb_mounted_blkzoned(sb)) { + } else if (!f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); } break; case Opt_nodiscard: - if (f2fs_sb_mounted_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "discard is required for zoned block devices"); return -EINVAL; @@ -440,7 +455,7 @@ static int parse_options(struct super_block *sb, char *options) if (args->from && match_int(args, &arg)) return -EINVAL; set_opt(sbi, INLINE_XATTR_SIZE); - sbi->inline_xattr_size = arg; + F2FS_OPTION(sbi).inline_xattr_size = arg; break; #else case Opt_user_xattr: @@ -480,7 +495,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) return -EINVAL; - sbi->active_logs = arg; + F2FS_OPTION(sbi).active_logs = arg; break; case Opt_disable_ext_identify: set_opt(sbi, DISABLE_EXT_IDENTIFY); @@ -524,9 +539,9 @@ static int parse_options(struct super_block *sb, char *options) if (test_opt(sbi, RESERVE_ROOT)) { f2fs_msg(sb, KERN_INFO, "Preserve previous reserve_root=%u", - sbi->root_reserved_blocks); + F2FS_OPTION(sbi).root_reserved_blocks); } else { - sbi->root_reserved_blocks = arg; + F2FS_OPTION(sbi).root_reserved_blocks = arg; set_opt(sbi, RESERVE_ROOT); } break; @@ -539,7 +554,7 @@ static int parse_options(struct super_block *sb, char *options) 
"Invalid uid value %d", arg); return -EINVAL; } - sbi->s_resuid = uid; + F2FS_OPTION(sbi).s_resuid = uid; break; case Opt_resgid: if (args->from && match_int(args, &arg)) @@ -550,7 +565,7 @@ static int parse_options(struct super_block *sb, char *options) "Invalid gid value %d", arg); return -EINVAL; } - sbi->s_resgid = gid; + F2FS_OPTION(sbi).s_resgid = gid; break; case Opt_mode: name = match_strdup(&args[0]); @@ -559,7 +574,7 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { - if (f2fs_sb_mounted_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "adaptive mode is not allowed with " "zoned block device feature"); @@ -585,7 +600,7 @@ static int parse_options(struct super_block *sb, char *options) 1 << arg, BIO_MAX_PAGES); return -EINVAL; } - sbi->write_io_size_bits = arg; + F2FS_OPTION(sbi).write_io_size_bits = arg; break; case Opt_fault_injection: if (args->from && match_int(args, &arg)) @@ -646,13 +661,13 @@ static int parse_options(struct super_block *sb, char *options) return ret; break; case Opt_jqfmt_vfsold: - sbi->s_jquota_fmt = QFMT_VFS_OLD; + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD; break; case Opt_jqfmt_vfsv0: - sbi->s_jquota_fmt = QFMT_VFS_V0; + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0; break; case Opt_jqfmt_vfsv1: - sbi->s_jquota_fmt = QFMT_VFS_V1; + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1; break; case Opt_noquota: clear_opt(sbi, QUOTA); @@ -679,6 +694,73 @@ static int parse_options(struct super_block *sb, char *options) "quota operations not supported"); break; #endif + case Opt_whint: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 10 && + !strncmp(name, "user-based", 10)) { + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER; + } else if (strlen(name) == 3 && + !strncmp(name, "off", 3)) { + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + } else if (strlen(name) == 8 && + !strncmp(name, 
"fs-based", 8)) { + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_alloc: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + if (strlen(name) == 7 && + !strncmp(name, "default", 7)) { + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + } else if (strlen(name) == 5 && + !strncmp(name, "reuse", 5)) { + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_fsync: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 5 && + !strncmp(name, "posix", 5)) { + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; + } else if (strlen(name) == 6 && + !strncmp(name, "strict", 6)) { + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_test_dummy_encryption: +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (!f2fs_sb_has_encrypt(sb)) { + f2fs_msg(sb, KERN_ERR, "Encrypt feature is off"); + return -EINVAL; + } + + F2FS_OPTION(sbi).test_dummy_encryption = true; + f2fs_msg(sb, KERN_INFO, + "Test dummy encryption mode enabled"); +#else + f2fs_msg(sb, KERN_INFO, + "Test dummy encryption mount option ignored"); +#endif + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -699,14 +781,22 @@ static int parse_options(struct super_block *sb, char *options) } if (test_opt(sbi, INLINE_XATTR_SIZE)) { + if (!f2fs_sb_has_extra_attr(sb) || + !f2fs_sb_has_flexible_inline_xattr(sb)) { + f2fs_msg(sb, KERN_ERR, + "extra_attr or flexible_inline_xattr " + "feature is off"); + return -EINVAL; + } if (!test_opt(sbi, INLINE_XATTR)) { f2fs_msg(sb, KERN_ERR, "inline_xattr_size option should be " "set with inline_xattr option"); return -EINVAL; } - if (!sbi->inline_xattr_size || - sbi->inline_xattr_size >= DEF_ADDRS_PER_INODE - + if (!F2FS_OPTION(sbi).inline_xattr_size || + 
F2FS_OPTION(sbi).inline_xattr_size >= + DEF_ADDRS_PER_INODE - F2FS_TOTAL_EXTRA_ATTR_SIZE - DEF_INLINE_RESERVED_SIZE - DEF_MIN_INLINE_SIZE) { @@ -715,6 +805,12 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } + + /* Not pass down write hints if the number of active logs is lesser + * than NR_CURSEG_TYPE. + */ + if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; return 0; } @@ -731,7 +827,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; - fi->i_advise = 0; init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); @@ -743,10 +838,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_rwsem(&fi->i_mmap_sem); init_rwsem(&fi->i_xattr_sem); -#ifdef CONFIG_QUOTA - memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); - fi->i_reserved_quota = 0; -#endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; @@ -956,7 +1047,7 @@ static void f2fs_put_super(struct super_block *sb) mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) @@ -1070,8 +1161,9 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - if (buf->f_bfree > sbi->root_reserved_blocks) - buf->f_bavail = buf->f_bfree - sbi->root_reserved_blocks; + if (buf->f_bfree > F2FS_OPTION(sbi).root_reserved_blocks) + buf->f_bavail = buf->f_bfree - + F2FS_OPTION(sbi).root_reserved_blocks; else buf->f_bavail = 0; @@ -1106,10 +1198,10 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, #ifdef CONFIG_QUOTA struct f2fs_sb_info 
*sbi = F2FS_SB(sb); - if (sbi->s_jquota_fmt) { + if (F2FS_OPTION(sbi).s_jquota_fmt) { char *fmtname = ""; - switch (sbi->s_jquota_fmt) { + switch (F2FS_OPTION(sbi).s_jquota_fmt) { case QFMT_VFS_OLD: fmtname = "vfsold"; break; @@ -1123,14 +1215,17 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, seq_printf(seq, ",jqfmt=%s", fmtname); } - if (sbi->s_qf_names[USRQUOTA]) - seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", + F2FS_OPTION(sbi).s_qf_names[USRQUOTA]); - if (sbi->s_qf_names[GRPQUOTA]) - seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + if (F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]); - if (sbi->s_qf_names[PRJQUOTA]) - seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]); + if (F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]); #endif } @@ -1165,7 +1260,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noinline_xattr"); if (test_opt(sbi, INLINE_XATTR_SIZE)) seq_printf(seq, ",inline_xattr_size=%u", - sbi->inline_xattr_size); + F2FS_OPTION(sbi).inline_xattr_size); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -1201,18 +1296,20 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, "adaptive"); else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); - seq_printf(seq, ",active_logs=%u", sbi->active_logs); + seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); if (test_opt(sbi, RESERVE_ROOT)) seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", - sbi->root_reserved_blocks, - from_kuid_munged(&init_user_ns, sbi->s_resuid), - from_kgid_munged(&init_user_ns, sbi->s_resgid)); + F2FS_OPTION(sbi).root_reserved_blocks, + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + 
from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef CONFIG_F2FS_FAULT_INJECTION if (test_opt(sbi, FAULT_INJECTION)) seq_printf(seq, ",fault_injection=%u", - sbi->fault_info.inject_rate); + F2FS_OPTION(sbi).fault_info.inject_rate); #endif #ifdef CONFIG_QUOTA if (test_opt(sbi, QUOTA)) @@ -1225,15 +1322,37 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); + if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) + seq_printf(seq, ",whint_mode=%s", "user-based"); + else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) + seq_printf(seq, ",whint_mode=%s", "fs-based"); +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (F2FS_OPTION(sbi).test_dummy_encryption) + seq_puts(seq, ",test_dummy_encryption"); +#endif + + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) + seq_printf(seq, ",alloc_mode=%s", "default"); + else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) + seq_printf(seq, ",alloc_mode=%s", "reuse"); + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) + seq_printf(seq, ",fsync_mode=%s", "posix"); + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) + seq_printf(seq, ",fsync_mode=%s", "strict"); return 0; } static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ - sbi->active_logs = NR_CURSEG_TYPE; - sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + F2FS_OPTION(sbi).active_logs = NR_CURSEG_TYPE; + F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; + F2FS_OPTION(sbi).test_dummy_encryption = false; + sbi->readdir_ra = 1; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1243,7 +1362,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, NOHEAP); 
sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_blkzoned(sbi->sb)) { + if (f2fs_sb_has_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { @@ -1270,16 +1389,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; unsigned long old_sb_flags; - int err, active_logs; + int err; bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); -#ifdef CONFIG_F2FS_FAULT_INJECTION - struct f2fs_fault_info ffi = sbi->fault_info; -#endif #ifdef CONFIG_QUOTA - int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; int i, j; #endif @@ -1289,21 +1403,21 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ org_mount_opt = sbi->mount_opt; old_sb_flags = sb->s_flags; - active_logs = sbi->active_logs; #ifdef CONFIG_QUOTA - s_jquota_fmt = sbi->s_jquota_fmt; + org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i]) { - s_qf_names[i] = kstrdup(sbi->s_qf_names[i], - GFP_KERNEL); - if (!s_qf_names[i]) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { + org_mount_opt.s_qf_names[i] = + kstrdup(F2FS_OPTION(sbi).s_qf_names[i], + GFP_KERNEL); + if (!org_mount_opt.s_qf_names[i]) { for (j = 0; j < i; j++) - kfree(s_qf_names[j]); + kfree(org_mount_opt.s_qf_names[j]); return -ENOMEM; } } else { - s_qf_names[i] = NULL; + org_mount_opt.s_qf_names[i] = NULL; } } #endif @@ -1373,7 +1487,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & SB_RDONLY) { + if (*flags & SB_RDONLY || + F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { writeback_inodes_sb(sb, WB_REASON_SYNC); sync_inodes_sb(sb); @@ -1399,7 +1514,7 @@ skip: #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - kfree(s_qf_names[i]); + 
kfree(org_mount_opt.s_qf_names[i]); #endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -1417,18 +1532,14 @@ restore_gc: } restore_opts: #ifdef CONFIG_QUOTA - sbi->s_jquota_fmt = s_jquota_fmt; + F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - kfree(sbi->s_qf_names[i]); - sbi->s_qf_names[i] = s_qf_names[i]; + kfree(F2FS_OPTION(sbi).s_qf_names[i]); + F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i]; } #endif sbi->mount_opt = org_mount_opt; - sbi->active_logs = active_logs; sb->s_flags = old_sb_flags; -#ifdef CONFIG_F2FS_FAULT_INJECTION - sbi->fault_info = ffi; -#endif return err; } @@ -1456,7 +1567,7 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, while (toread > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); repeat: - page = read_mapping_page(mapping, blkidx, NULL); + page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); if (IS_ERR(page)) { if (PTR_ERR(page) == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, HZ/50); @@ -1550,8 +1661,8 @@ static qsize_t *f2fs_get_reserved_space(struct inode *inode) static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) { - return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], - sbi->s_jquota_fmt, type); + return dquot_quota_on_mount(sbi->sb, F2FS_OPTION(sbi).s_qf_names[type], + F2FS_OPTION(sbi).s_jquota_fmt, type); } int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) @@ -1570,7 +1681,7 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) } for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i]) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { err = f2fs_quota_on_mount(sbi, i); if (!err) { enabled = 1; @@ -1797,11 +1908,28 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * 
Encrypting the root directory is not allowed because fsck + * expects lost+found directory to exist and remain unencrypted + * if LOST_FOUND feature is enabled. + * + */ + if (f2fs_sb_has_lost_found(sbi->sb) && + inode->i_ino == F2FS_ROOT_INO(sbi)) + return -EPERM; + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len, fs_data, XATTR_CREATE); } +static bool f2fs_dummy_context(struct inode *inode) +{ + return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); +} + static unsigned f2fs_max_namelen(struct inode *inode) { return S_ISLNK(inode->i_mode) ? @@ -1812,6 +1940,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, + .dummy_context = f2fs_dummy_context, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; @@ -1894,7 +2023,6 @@ static int __f2fs_commit_super(struct buffer_head *bh, lock_buffer(bh); if (super) memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); - set_buffer_uptodate(bh); set_buffer_dirty(bh); unlock_buffer(bh); @@ -2181,6 +2309,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->dirty_device = 0; spin_lock_init(&sbi->dev_lock); + + init_rwsem(&sbi->sb_lock); } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -2206,7 +2336,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) unsigned int n = 0; int err = -EIO; - if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + if (!f2fs_sb_has_blkzoned(sbi->sb)) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != @@ -2334,7 +2464,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) } /* write back-up superblock first */ - bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1); + bh = sb_bread(sbi->sb, sbi->valid_super_block ? 
0 : 1); if (!bh) return -EIO; err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); @@ -2345,7 +2475,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; /* write current valid superblock */ - bh = sb_getblk(sbi->sb, sbi->valid_super_block); + bh = sb_bread(sbi->sb, sbi->valid_super_block); if (!bh) return -EIO; err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); @@ -2413,7 +2543,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && - !f2fs_sb_mounted_blkzoned(sbi->sb)) { + !f2fs_sb_has_blkzoned(sbi->sb)) { f2fs_msg(sbi->sb, KERN_ERR, "Zoned block device feature not enabled\n"); return -EINVAL; @@ -2447,6 +2577,18 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) return 0; } +static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_i = SM_I(sbi); + + /* adjust parameters according to the volume size */ + if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + sm_i->dcc_info->discard_granularity = 1; + sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; + } +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -2494,8 +2636,8 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; - sbi->s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); - sbi->s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); + F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); + F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sb)) @@ -2508,7 +2650,7 @@ try_onemore: * devices, but mandatory for host-managed zoned block devices. 
*/ #ifndef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_ERR, "Zoned block device support is not enabled\n"); err = -EOPNOTSUPP; @@ -2724,7 +2866,7 @@ try_onemore: * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. */ - if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) { + if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) { err = f2fs_enable_quotas(sb); if (err) { f2fs_msg(sb, KERN_ERR, @@ -2799,6 +2941,8 @@ skip_recovery: f2fs_join_shrinker(sbi); + f2fs_tuning_parameters(sbi); + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); @@ -2807,7 +2951,7 @@ skip_recovery: free_meta: #ifdef CONFIG_QUOTA - if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) + if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) f2fs_quota_off_umount(sbi->sb); #endif f2fs_sync_inode_meta(sbi); @@ -2851,7 +2995,7 @@ free_bio_info: free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif kfree(options); free_sb_buf: diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d978c7b6ea04..f33a56d6e6dd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -58,7 +58,7 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) #ifdef CONFIG_F2FS_FAULT_INJECTION else if (struct_type == FAULT_INFO_RATE || struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&sbi->fault_info; + return (unsigned char *)&F2FS_OPTION(sbi).fault_info; #endif return NULL; } @@ -92,10 +92,10 @@ static ssize_t features_show(struct f2fs_attr *a, if (!sb->s_bdev->bd_part) return snprintf(buf, PAGE_SIZE, "0\n"); - if (f2fs_sb_has_crypto(sb)) + if (f2fs_sb_has_encrypt(sb)) len += snprintf(buf, PAGE_SIZE - len, "%s", "encryption"); - if (f2fs_sb_mounted_blkzoned(sb)) + if 
(f2fs_sb_has_blkzoned(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "blkzoned"); if (f2fs_sb_has_extra_attr(sb)) @@ -116,6 +116,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_inode_crtime(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_crtime"); + if (f2fs_sb_has_lost_found(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "lost_found"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -136,6 +139,27 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, if (!ptr) return -EINVAL; + if (!strcmp(a->attr.name, "extension_list")) { + __u8 (*extlist)[F2FS_EXTENSION_LEN] = + sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int len = 0, i; + + len += snprintf(buf + len, PAGE_SIZE - len, + "cold file extenstion:\n"); + for (i = 0; i < cold_count; i++) + len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + extlist[i]); + + len += snprintf(buf + len, PAGE_SIZE - len, + "hot file extenstion:\n"); + for (i = cold_count; i < cold_count + hot_count; i++) + len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + extlist[i]); + return len; + } + ui = (unsigned int *)(ptr + a->offset); return snprintf(buf, PAGE_SIZE, "%u\n", *ui); @@ -154,6 +178,41 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (!ptr) return -EINVAL; + if (!strcmp(a->attr.name, "extension_list")) { + const char *name = strim((char *)buf); + bool set = true, hot; + + if (!strncmp(name, "[h]", 3)) + hot = true; + else if (!strncmp(name, "[c]", 3)) + hot = false; + else + return -EINVAL; + + name += 3; + + if (*name == '!') { + name++; + set = false; + } + + if (strlen(name) >= F2FS_EXTENSION_LEN) + return -EINVAL; + + down_write(&sbi->sb_lock); + + ret = update_extension_list(sbi, name, hot, set); + if (ret) + goto out; + + ret = f2fs_commit_super(sbi, false); + if (ret) + 
update_extension_list(sbi, name, hot, !set); +out: + up_write(&sbi->sb_lock); + return ret ? ret : count; + } + ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); @@ -166,7 +225,7 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - - sbi->root_reserved_blocks)) { + F2FS_OPTION(sbi).root_reserved_blocks)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } @@ -236,6 +295,7 @@ enum feat_id { FEAT_FLEXIBLE_INLINE_XATTR, FEAT_QUOTA_INO, FEAT_INODE_CRTIME, + FEAT_LOST_FOUND, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -251,6 +311,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_FLEXIBLE_INLINE_XATTR: case FEAT_QUOTA_INO: case FEAT_INODE_CRTIME: + case FEAT_LOST_FOUND: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -307,6 +368,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); @@ -329,6 +391,7 @@ F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); +F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -357,6 +420,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), 
ATTR_LIST(gc_pin_file_thresh), + ATTR_LIST(extension_list), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), @@ -383,6 +447,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(flexible_inline_xattr), ATTR_LIST(quota_ino), ATTR_LIST(inode_crtime), + ATTR_LIST(lost_found), NULL, }; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d4d04fee568a..4b12ba70a895 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -347,9 +347,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions * between unlocked_inode_to_wb_begin/end() are guaranteed to be - * synchronizing against mapping->tree_lock. + * synchronizing against the i_pages lock. * - * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock + * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ @@ -361,7 +361,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } spin_lock(&inode->i_lock); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); /* * Once I_FREEING is visible under i_lock, the eviction path owns @@ -373,22 +373,22 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to - * pages actually under underwriteback. + * pages actually under writeback. 
*/ - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, PAGECACHE_TAG_DIRTY) { struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (likely(page) && PageDirty(page)) { dec_wb_stat(old_wb, WB_RECLAIMABLE); inc_wb_stat(new_wb, WB_RECLAIMABLE); } } - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, PAGECACHE_TAG_WRITEBACK) { struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (likely(page)) { WARN_ON_ONCE(!PageWriteback(page)); dec_wb_stat(old_wb, WB_WRITEBACK); @@ -430,7 +430,7 @@ skip_switch: */ smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); @@ -506,8 +506,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the mapping's - * tree_lock so that stat transfer can synchronize against them. + * the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. 
*/ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); @@ -1343,7 +1343,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) dirty = inode->i_state & I_DIRTY; if (inode->i_state & I_DIRTY_TIME) { - if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + if ((dirty & I_DIRTY_INODE) || wbc->sync_mode == WB_SYNC_ALL || unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || unlikely(time_after(jiffies, @@ -2112,7 +2112,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) */ void __mark_inode_dirty(struct inode *inode, int flags) { -#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) struct super_block *sb = inode->i_sb; int dirtytime; @@ -2122,7 +2121,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) * Don't do this for I_DIRTY_PAGES - that doesn't actually * dirty the inode itself */ - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) { + if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) { trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) @@ -2197,7 +2196,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (dirtytime) inode->dirtied_time_when = jiffies; - if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES)) + if (inode->i_state & I_DIRTY) dirty_list = &wb->b_dirty; else dirty_list = &wb->b_dirty_time; @@ -2221,8 +2220,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) } out_unlock_inode: spin_unlock(&inode->i_lock); - -#undef I_DIRTY_INODE } EXPORT_SYMBOL(__mark_inode_dirty); diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index 56cce7fdd39e..c184c5a356ff 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -125,7 +125,7 @@ struct fscache_cache *fscache_select_cache_for_object( } /* the parent is unbacked */ - if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) { /* cookie not an index and is unbacked */ spin_unlock(&cookie->lock); _leave(" = NULL [cookie ub,ni]"); diff --git 
a/fs/fscache/cookie.c b/fs/fscache/cookie.c index d705125665f0..97137d7ec5ee 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -21,12 +21,54 @@ struct kmem_cache *fscache_cookie_jar; static atomic_t fscache_object_debug_id = ATOMIC_INIT(0); -static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie); +#define fscache_cookie_hash_shift 15 +static struct hlist_bl_head fscache_cookie_hash[1 << fscache_cookie_hash_shift]; + +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie, + loff_t object_size); static int fscache_alloc_object(struct fscache_cache *cache, struct fscache_cookie *cookie); static int fscache_attach_object(struct fscache_cookie *cookie, struct fscache_object *object); +static void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) +{ + struct hlist_node *object; + const u8 *k; + unsigned loop; + + pr_err("%c-cookie c=%p [p=%p fl=%lx nc=%u na=%u]\n", + prefix, cookie, cookie->parent, cookie->flags, + atomic_read(&cookie->n_children), + atomic_read(&cookie->n_active)); + pr_err("%c-cookie d=%p n=%p\n", + prefix, cookie->def, cookie->netfs_data); + + object = READ_ONCE(cookie->backing_objects.first); + if (object) + pr_err("%c-cookie o=%p\n", + prefix, hlist_entry(object, struct fscache_object, cookie_link)); + + pr_err("%c-key=[%u] '", prefix, cookie->key_len); + k = (cookie->key_len <= sizeof(cookie->inline_key)) ? 
+ cookie->inline_key : cookie->key; + for (loop = 0; loop < cookie->key_len; loop++) + pr_cont("%02x", k[loop]); + pr_cont("'\n"); +} + +void fscache_free_cookie(struct fscache_cookie *cookie) +{ + if (cookie) { + BUG_ON(!hlist_empty(&cookie->backing_objects)); + if (cookie->aux_len > sizeof(cookie->inline_aux)) + kfree(cookie->aux); + if (cookie->key_len > sizeof(cookie->inline_key)) + kfree(cookie->key); + kmem_cache_free(fscache_cookie_jar, cookie); + } +} + /* * initialise an cookie jar slab element prior to any use */ @@ -41,6 +83,170 @@ void fscache_cookie_init_once(void *_cookie) } /* + * Set the index key in a cookie. The cookie struct has space for a 12-byte + * key plus length and hash, but if that's not big enough, it's instead a + * pointer to a buffer containing 3 bytes of hash, 1 byte of length and then + * the key data. + */ +static int fscache_set_key(struct fscache_cookie *cookie, + const void *index_key, size_t index_key_len) +{ + unsigned long long h; + u32 *buf; + int i; + + cookie->key_len = index_key_len; + + if (index_key_len > sizeof(cookie->inline_key)) { + buf = kzalloc(index_key_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + cookie->key = buf; + } else { + buf = (u32 *)cookie->inline_key; + buf[0] = 0; + buf[1] = 0; + buf[2] = 0; + } + + memcpy(buf, index_key, index_key_len); + + /* Calculate a hash and combine this with the length in the first word + * or first half word + */ + h = (unsigned long)cookie->parent; + h += index_key_len + cookie->type; + for (i = 0; i < (index_key_len + sizeof(u32) - 1) / sizeof(u32); i++) + h += buf[i]; + + cookie->key_hash = h ^ (h >> 32); + return 0; +} + +static long fscache_compare_cookie(const struct fscache_cookie *a, + const struct fscache_cookie *b) +{ + const void *ka, *kb; + + if (a->key_hash != b->key_hash) + return (long)a->key_hash - (long)b->key_hash; + if (a->parent != b->parent) + return (long)a->parent - (long)b->parent; + if (a->key_len != b->key_len) + return (long)a->key_len - 
(long)b->key_len; + if (a->type != b->type) + return (long)a->type - (long)b->type; + + if (a->key_len <= sizeof(a->inline_key)) { + ka = &a->inline_key; + kb = &b->inline_key; + } else { + ka = a->key; + kb = b->key; + } + return memcmp(ka, kb, a->key_len); +} + +/* + * Allocate a cookie. + */ +struct fscache_cookie *fscache_alloc_cookie( + struct fscache_cookie *parent, + const struct fscache_cookie_def *def, + const void *index_key, size_t index_key_len, + const void *aux_data, size_t aux_data_len, + void *netfs_data, + loff_t object_size) +{ + struct fscache_cookie *cookie; + + /* allocate and initialise a cookie */ + cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); + if (!cookie) + return NULL; + + cookie->key_len = index_key_len; + cookie->aux_len = aux_data_len; + + if (fscache_set_key(cookie, index_key, index_key_len) < 0) + goto nomem; + + if (cookie->aux_len <= sizeof(cookie->inline_aux)) { + memcpy(cookie->inline_aux, aux_data, cookie->aux_len); + } else { + cookie->aux = kmemdup(aux_data, cookie->aux_len, GFP_KERNEL); + if (!cookie->aux) + goto nomem; + } + + atomic_set(&cookie->usage, 1); + atomic_set(&cookie->n_children, 0); + + /* We keep the active count elevated until relinquishment to prevent an + * attempt to wake up every time the object operations queue quiesces. + */ + atomic_set(&cookie->n_active, 1); + + cookie->def = def; + cookie->parent = parent; + cookie->netfs_data = netfs_data; + cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); + cookie->type = def->type; + + /* radix tree insertion won't use the preallocation pool unless it's + * told it may not wait */ + INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + return cookie; + +nomem: + fscache_free_cookie(cookie); + return NULL; +} + +/* + * Attempt to insert the new cookie into the hash. If there's a collision, we + * return the old cookie if it's not in use and an error otherwise. 
+ */ +struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate) +{ + struct fscache_cookie *cursor; + struct hlist_bl_head *h; + struct hlist_bl_node *p; + unsigned int bucket; + + bucket = candidate->key_hash & (ARRAY_SIZE(fscache_cookie_hash) - 1); + h = &fscache_cookie_hash[bucket]; + + hlist_bl_lock(h); + hlist_bl_for_each_entry(cursor, p, h, hash_link) { + if (fscache_compare_cookie(candidate, cursor) == 0) + goto collision; + } + + __set_bit(FSCACHE_COOKIE_ACQUIRED, &candidate->flags); + fscache_cookie_get(candidate->parent, fscache_cookie_get_acquire_parent); + atomic_inc(&candidate->parent->n_children); + hlist_bl_add_head(&candidate->hash_link, h); + hlist_bl_unlock(h); + return candidate; + +collision: + if (test_and_set_bit(FSCACHE_COOKIE_ACQUIRED, &cursor->flags)) { + trace_fscache_cookie(cursor, fscache_cookie_collision, + atomic_read(&cursor->usage)); + pr_err("Duplicate cookie detected\n"); + fscache_print_cookie(cursor, 'O'); + fscache_print_cookie(candidate, 'N'); + hlist_bl_unlock(h); + return NULL; + } + + fscache_cookie_get(cursor, fscache_cookie_get_reacquire); + hlist_bl_unlock(h); + return cursor; +} + +/* * request a cookie to represent an object (index, datafile, xattr, etc) * - parent specifies the parent object * - the top level index cookie for each netfs is stored in the fscache_netfs @@ -58,10 +264,13 @@ void fscache_cookie_init_once(void *_cookie) struct fscache_cookie *__fscache_acquire_cookie( struct fscache_cookie *parent, const struct fscache_cookie_def *def, + const void *index_key, size_t index_key_len, + const void *aux_data, size_t aux_data_len, void *netfs_data, + loff_t object_size, bool enable) { - struct fscache_cookie *cookie; + struct fscache_cookie *candidate, *cookie; BUG_ON(!def); @@ -69,6 +278,13 @@ struct fscache_cookie *__fscache_acquire_cookie( parent ? 
(char *) parent->def->name : "<no-parent>", def->name, netfs_data, enable); + if (!index_key || !index_key_len || index_key_len > 255 || aux_data_len > 255) + return NULL; + if (!aux_data || !aux_data_len) { + aux_data = NULL; + aux_data_len = 0; + } + fscache_stat(&fscache_n_acquires); /* if there's no parent cookie, then we don't create one here either */ @@ -79,41 +295,31 @@ struct fscache_cookie *__fscache_acquire_cookie( } /* validate the definition */ - BUG_ON(!def->get_key); BUG_ON(!def->name[0]); BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX && - parent->def->type != FSCACHE_COOKIE_TYPE_INDEX); + parent->type != FSCACHE_COOKIE_TYPE_INDEX); - /* allocate and initialise a cookie */ - cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); - if (!cookie) { + candidate = fscache_alloc_cookie(parent, def, + index_key, index_key_len, + aux_data, aux_data_len, + netfs_data, object_size); + if (!candidate) { fscache_stat(&fscache_n_acquires_oom); _leave(" [ENOMEM]"); return NULL; } - atomic_set(&cookie->usage, 1); - atomic_set(&cookie->n_children, 0); - - /* We keep the active count elevated until relinquishment to prevent an - * attempt to wake up every time the object operations queue quiesces. 
- */ - atomic_set(&cookie->n_active, 1); - - atomic_inc(&parent->usage); - atomic_inc(&parent->n_children); + cookie = fscache_hash_cookie(candidate); + if (!cookie) { + trace_fscache_cookie(candidate, fscache_cookie_discard, 1); + goto out; + } - cookie->def = def; - cookie->parent = parent; - cookie->netfs_data = netfs_data; - cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); + if (cookie == candidate) + candidate = NULL; - /* radix tree insertion won't use the preallocation pool unless it's - * told it may not wait */ - INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - - switch (cookie->def->type) { + switch (cookie->type) { case FSCACHE_COOKIE_TYPE_INDEX: fscache_stat(&fscache_n_cookie_index); break; @@ -125,16 +331,19 @@ struct fscache_cookie *__fscache_acquire_cookie( break; } + trace_fscache_acquire(cookie); + if (enable) { /* if the object is an index then we need do nothing more here * - we create indices on disk when we need them as an index * may exist in multiple caches */ - if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { - if (fscache_acquire_non_index_cookie(cookie) == 0) { + if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) { + if (fscache_acquire_non_index_cookie(cookie, object_size) == 0) { set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); } else { atomic_dec(&parent->n_children); - __fscache_cookie_put(cookie); + fscache_cookie_put(cookie, + fscache_cookie_put_acquire_nobufs); fscache_stat(&fscache_n_acquires_nobufs); _leave(" = NULL"); return NULL; @@ -145,7 +354,9 @@ struct fscache_cookie *__fscache_acquire_cookie( } fscache_stat(&fscache_n_acquires_ok); - _leave(" = %p", cookie); + +out: + fscache_free_cookie(candidate); return cookie; } EXPORT_SYMBOL(__fscache_acquire_cookie); @@ -154,24 +365,30 @@ EXPORT_SYMBOL(__fscache_acquire_cookie); * Enable a cookie to permit it to accept new operations. 
*/ void __fscache_enable_cookie(struct fscache_cookie *cookie, + const void *aux_data, + loff_t object_size, bool (*can_enable)(void *data), void *data) { _enter("%p", cookie); + trace_fscache_enable(cookie); + wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, TASK_UNINTERRUPTIBLE); + fscache_update_aux(cookie, aux_data); + if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock; if (can_enable && !can_enable(data)) { /* The netfs decided it didn't want to enable after all */ - } else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { + } else if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) { /* Wait for outstanding disablement to complete */ __fscache_wait_on_invalidate(cookie); - if (fscache_acquire_non_index_cookie(cookie) == 0) + if (fscache_acquire_non_index_cookie(cookie, object_size) == 0) set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); } else { set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); @@ -188,11 +405,11 @@ EXPORT_SYMBOL(__fscache_enable_cookie); * - this must make sure the index chain is instantiated and instantiate the * object representation too */ -static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) +static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie, + loff_t object_size) { struct fscache_object *object; struct fscache_cache *cache; - uint64_t i_size; int ret; _enter(""); @@ -231,9 +448,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) return ret; } - /* pass on how big the object we're caching is supposed to be */ - cookie->def->get_attr(cookie->netfs_data, &i_size); - spin_lock(&cookie->lock); if (hlist_empty(&cookie->backing_objects)) { spin_unlock(&cookie->lock); @@ -243,7 +457,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); - fscache_set_store_limit(object, i_size); + fscache_set_store_limit(object, object_size); /* 
initiate the process of looking up all the objects in the chain * (done by fscache_initialise_object()) */ @@ -318,7 +532,7 @@ static int fscache_alloc_object(struct fscache_cache *cache, * attached to the cookie */ if (fscache_attach_object(cookie, object) < 0) { fscache_stat(&fscache_n_cop_put_object); - cache->ops->put_object(object); + cache->ops->put_object(object, fscache_obj_put_attach_fail); fscache_stat_d(&fscache_n_cop_put_object); } @@ -338,7 +552,7 @@ object_already_extant: error_put: fscache_stat(&fscache_n_cop_put_object); - cache->ops->put_object(object); + cache->ops->put_object(object, fscache_obj_put_alloc_fail); fscache_stat_d(&fscache_n_cop_put_object); error: _leave(" = %d", ret); @@ -398,7 +612,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, /* attach to the cookie */ object->cookie = cookie; - atomic_inc(&cookie->usage); + fscache_cookie_get(cookie, fscache_cookie_get_attach_object); hlist_add_head(&object->cookie_link, &cookie->backing_objects); fscache_objlist_add(object); @@ -426,10 +640,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie) * there, and if it's doing that, it may as well just retire the * cookie. */ - ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); - - /* We will be updating the cookie too. */ - BUG_ON(!cookie->def->get_aux); + ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); /* If there's an object, we tell the object state machine to handle the * invalidation on our behalf, otherwise there's nothing to do. 
@@ -473,7 +684,7 @@ EXPORT_SYMBOL(__fscache_wait_on_invalidate); /* * update the index entries backing a cookie */ -void __fscache_update_cookie(struct fscache_cookie *cookie) +void __fscache_update_cookie(struct fscache_cookie *cookie, const void *aux_data) { struct fscache_object *object; @@ -487,10 +698,10 @@ void __fscache_update_cookie(struct fscache_cookie *cookie) _enter("{%s}", cookie->def->name); - BUG_ON(!cookie->def->get_aux); - spin_lock(&cookie->lock); + fscache_update_aux(cookie, aux_data); + if (fscache_cookie_enabled(cookie)) { /* update the index entry on disk in each cache backing this * cookie. @@ -509,13 +720,17 @@ EXPORT_SYMBOL(__fscache_update_cookie); /* * Disable a cookie to stop it from accepting new requests from the netfs. */ -void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) +void __fscache_disable_cookie(struct fscache_cookie *cookie, + const void *aux_data, + bool invalidate) { struct fscache_object *object; bool awaken = false; _enter("%p,%u", cookie, invalidate); + trace_fscache_disable(cookie); + ASSERTCMP(atomic_read(&cookie->n_active), >, 0); if (atomic_read(&cookie->n_children) != 0) { @@ -526,6 +741,9 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, TASK_UNINTERRUPTIBLE); + + fscache_update_aux(cookie, aux_data); + if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock_enable; @@ -563,7 +781,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) } /* Make sure any pending writes are cancelled. 
*/ - if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) + if (cookie->type != FSCACHE_COOKIE_TYPE_INDEX) fscache_invalidate_writes(cookie); /* Reset the cookie state if it wasn't relinquished */ @@ -585,7 +803,9 @@ EXPORT_SYMBOL(__fscache_disable_cookie); * - all dependents of this cookie must have already been unregistered * (indices/files/pages) */ -void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) +void __fscache_relinquish_cookie(struct fscache_cookie *cookie, + const void *aux_data, + bool retire) { fscache_stat(&fscache_n_relinquishes); if (retire) @@ -601,15 +821,18 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) cookie, cookie->def->name, cookie->netfs_data, atomic_read(&cookie->n_active), retire); + trace_fscache_relinquish(cookie, retire); + /* No further netfs-accessing operations on this cookie permitted */ - set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags); + if (test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) + BUG(); - __fscache_disable_cookie(cookie, retire); + __fscache_disable_cookie(cookie, aux_data, retire); /* Clear pointers back to the netfs */ cookie->netfs_data = NULL; cookie->def = NULL; - BUG_ON(cookie->stores.rnode); + BUG_ON(!radix_tree_empty(&cookie->stores)); if (cookie->parent) { ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); @@ -619,35 +842,54 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) /* Dispose of the netfs's link to the cookie */ ASSERTCMP(atomic_read(&cookie->usage), >, 0); - fscache_cookie_put(cookie); + fscache_cookie_put(cookie, fscache_cookie_put_relinquish); _leave(""); } EXPORT_SYMBOL(__fscache_relinquish_cookie); /* - * destroy a cookie + * Remove a cookie from the hash table. 
*/ -void __fscache_cookie_put(struct fscache_cookie *cookie) +static void fscache_unhash_cookie(struct fscache_cookie *cookie) +{ + struct hlist_bl_head *h; + unsigned int bucket; + + bucket = cookie->key_hash & (ARRAY_SIZE(fscache_cookie_hash) - 1); + h = &fscache_cookie_hash[bucket]; + + hlist_bl_lock(h); + hlist_bl_del(&cookie->hash_link); + hlist_bl_unlock(h); +} + +/* + * Drop a reference to a cookie. + */ +void fscache_cookie_put(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) { struct fscache_cookie *parent; + int usage; _enter("%p", cookie); - for (;;) { - _debug("FREE COOKIE %p", cookie); - parent = cookie->parent; - BUG_ON(!hlist_empty(&cookie->backing_objects)); - kmem_cache_free(fscache_cookie_jar, cookie); + do { + usage = atomic_dec_return(&cookie->usage); + trace_fscache_cookie(cookie, where, usage); - if (!parent) - break; + if (usage > 0) + return; + BUG_ON(usage < 0); + + parent = cookie->parent; + fscache_unhash_cookie(cookie); + fscache_free_cookie(cookie); cookie = parent; - BUG_ON(atomic_read(&cookie->usage) <= 0); - if (!atomic_dec_and_test(&cookie->usage)) - break; - } + where = fscache_cookie_put_parent; + } while (cookie); _leave(""); } @@ -657,7 +899,8 @@ void __fscache_cookie_put(struct fscache_cookie *cookie) * * NOTE: it only serves no-index type */ -int __fscache_check_consistency(struct fscache_cookie *cookie) +int __fscache_check_consistency(struct fscache_cookie *cookie, + const void *aux_data) { struct fscache_operation *op; struct fscache_object *object; @@ -666,7 +909,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) _enter("%p,", cookie); - ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); + ASSERTCMP(cookie->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); if (fscache_wait_for_deferred_lookup(cookie) < 0) return -ERESTARTSYS; @@ -678,13 +921,16 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) if (!op) return -ENOMEM; - fscache_operation_init(op, NULL, NULL, NULL); + 
fscache_operation_init(cookie, op, NULL, NULL, NULL); op->flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING) | (1 << FSCACHE_OP_UNUSE_COOKIE); + trace_fscache_page_op(cookie, NULL, op, fscache_page_op_check_consistency); spin_lock(&cookie->lock); + fscache_update_aux(cookie, aux_data); + if (!fscache_cookie_enabled(cookie) || hlist_empty(&cookie->backing_objects)) goto inconsistent; diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c index 5a117df2a9ef..aa46e48d8c75 100644 --- a/fs/fscache/fsdef.c +++ b/fs/fscache/fsdef.c @@ -13,16 +13,11 @@ #include <linux/module.h> #include "internal.h" -static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax); - -static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax); - static enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data, const void *data, - uint16_t datalen); + uint16_t datalen, + loff_t object_size); /* * The root index is owned by FS-Cache itself. 
@@ -60,6 +55,7 @@ struct fscache_cookie fscache_fsdef_index = { .backing_objects = HLIST_HEAD_INIT, .def = &fscache_fsdef_index_def, .flags = 1 << FSCACHE_COOKIE_ENABLED, + .type = FSCACHE_COOKIE_TYPE_INDEX, }; EXPORT_SYMBOL(fscache_fsdef_index); @@ -71,59 +67,18 @@ EXPORT_SYMBOL(fscache_fsdef_index); struct fscache_cookie_def fscache_fsdef_netfs_def = { .name = "FSDEF.netfs", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = fscache_fsdef_netfs_get_key, - .get_aux = fscache_fsdef_netfs_get_aux, .check_aux = fscache_fsdef_netfs_check_aux, }; /* - * get the key data for an FSDEF index record - this is the name of the netfs - * for which this entry is created - */ -static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct fscache_netfs *netfs = cookie_netfs_data; - unsigned klen; - - _enter("{%s.%u},", netfs->name, netfs->version); - - klen = strlen(netfs->name); - if (klen > bufmax) - return 0; - - memcpy(buffer, netfs->name, klen); - return klen; -} - -/* - * get the auxiliary data for an FSDEF index record - this is the index - * structure version number of the netfs for which this version is created - */ -static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct fscache_netfs *netfs = cookie_netfs_data; - unsigned dlen; - - _enter("{%s.%u},", netfs->name, netfs->version); - - dlen = sizeof(uint32_t); - if (dlen > bufmax) - return 0; - - memcpy(buffer, &netfs->version, dlen); - return dlen; -} - -/* * check that the index structure version number stored in the auxiliary data * matches the one the netfs gave us */ static enum fscache_checkaux fscache_fsdef_netfs_check_aux( void *cookie_netfs_data, const void *data, - uint16_t datalen) + uint16_t datalen, + loff_t object_size) { struct fscache_netfs *netfs = cookie_netfs_data; uint32_t version; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 0ff4b49a0037..500650f938fe 
100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -29,6 +29,7 @@ #define pr_fmt(fmt) "FS-Cache: " fmt #include <linux/fscache-cache.h> +#include <trace/events/fscache.h> #include <linux/sched.h> #define FSCACHE_MIN_THREADS 4 @@ -48,8 +49,16 @@ extern struct fscache_cache *fscache_select_cache_for_object( */ extern struct kmem_cache *fscache_cookie_jar; +extern void fscache_free_cookie(struct fscache_cookie *); extern void fscache_cookie_init_once(void *); -extern void __fscache_cookie_put(struct fscache_cookie *); +extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *, + const struct fscache_cookie_def *, + const void *, size_t, + const void *, size_t, + void *, loff_t); +extern struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *); +extern void fscache_cookie_put(struct fscache_cookie *, + enum fscache_cookie_trace); /* * fsdef.c @@ -311,14 +320,12 @@ static inline void fscache_raise_event(struct fscache_object *object, fscache_enqueue_object(object); } -/* - * drop a reference to a cookie - */ -static inline void fscache_cookie_put(struct fscache_cookie *cookie) +static inline void fscache_cookie_get(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) { - BUG_ON(atomic_read(&cookie->usage) <= 0); - if (atomic_dec_and_test(&cookie->usage)) - __fscache_cookie_put(cookie); + int usage = atomic_inc_return(&cookie->usage); + + trace_fscache_cookie(cookie, where, usage); } /* @@ -342,6 +349,27 @@ void fscache_put_context(struct fscache_cookie *cookie, void *context) cookie->def->put_context(cookie->netfs_data, context); } +/* + * Update the auxiliary data on a cookie. 
+ */ +static inline +void fscache_update_aux(struct fscache_cookie *cookie, const void *aux_data) +{ + void *p; + + if (!aux_data) + return; + if (cookie->aux_len <= sizeof(cookie->inline_aux)) + p = cookie->inline_aux; + else + p = cookie->aux; + + if (memcmp(p, aux_data, cookie->aux_len) != 0) { + memcpy(p, aux_data, cookie->aux_len); + set_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags); + } +} + /*****************************************************************************/ /* * debug tracing diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 249968dcbf5c..7dce110bf17d 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -16,6 +16,7 @@ #include <linux/completion.h> #include <linux/slab.h> #include <linux/seq_file.h> +#define CREATE_TRACE_POINTS #include "internal.h" MODULE_DESCRIPTION("FS Cache Manager"); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index a8aa00be4444..c2f605483cc5 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -14,69 +14,51 @@ #include <linux/slab.h> #include "internal.h" -static LIST_HEAD(fscache_netfs_list); - /* * register a network filesystem for caching */ int __fscache_register_netfs(struct fscache_netfs *netfs) { - struct fscache_netfs *ptr; - struct fscache_cookie *cookie; - int ret; + struct fscache_cookie *candidate, *cookie; _enter("{%s}", netfs->name); - INIT_LIST_HEAD(&netfs->link); - /* allocate a cookie for the primary index */ - cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL); - - if (!cookie) { + candidate = fscache_alloc_cookie(&fscache_fsdef_index, + &fscache_fsdef_netfs_def, + netfs->name, strlen(netfs->name), + &netfs->version, sizeof(netfs->version), + netfs, 0); + if (!candidate) { _leave(" = -ENOMEM"); return -ENOMEM; } - /* initialise the primary index cookie */ - atomic_set(&cookie->usage, 1); - atomic_set(&cookie->n_children, 0); - atomic_set(&cookie->n_active, 1); - - cookie->def = &fscache_fsdef_netfs_def; - cookie->parent = &fscache_fsdef_index; - cookie->netfs_data = 
netfs; - cookie->flags = 1 << FSCACHE_COOKIE_ENABLED; - - spin_lock_init(&cookie->lock); - spin_lock_init(&cookie->stores_lock); - INIT_HLIST_HEAD(&cookie->backing_objects); + candidate->flags = 1 << FSCACHE_COOKIE_ENABLED; /* check the netfs type is not already present */ - down_write(&fscache_addremove_sem); - - ret = -EEXIST; - list_for_each_entry(ptr, &fscache_netfs_list, link) { - if (strcmp(ptr->name, netfs->name) == 0) - goto already_registered; + cookie = fscache_hash_cookie(candidate); + if (!cookie) + goto already_registered; + if (cookie != candidate) { + trace_fscache_cookie(candidate, fscache_cookie_discard, 1); + fscache_free_cookie(candidate); } - atomic_inc(&cookie->parent->usage); + fscache_cookie_get(cookie->parent, fscache_cookie_get_register_netfs); atomic_inc(&cookie->parent->n_children); netfs->primary_index = cookie; - list_add(&netfs->link, &fscache_netfs_list); - ret = 0; pr_notice("Netfs '%s' registered for caching\n", netfs->name); + trace_fscache_netfs(netfs); + _leave(" = 0"); + return 0; already_registered: - up_write(&fscache_addremove_sem); - - if (ret < 0) - kmem_cache_free(fscache_cookie_jar, cookie); - - _leave(" = %d", ret); - return ret; + fscache_cookie_put(candidate, fscache_cookie_put_dup_netfs); + _leave(" = -EEXIST"); + return -EEXIST; } EXPORT_SYMBOL(__fscache_register_netfs); @@ -88,15 +70,8 @@ void __fscache_unregister_netfs(struct fscache_netfs *netfs) { _enter("{%s.%u}", netfs->name, netfs->version); - down_write(&fscache_addremove_sem); - - list_del(&netfs->link); - fscache_relinquish_cookie(netfs->primary_index, 0); - - up_write(&fscache_addremove_sem); - - pr_notice("Netfs '%s' unregistered from caching\n", - netfs->name); + fscache_relinquish_cookie(netfs->primary_index, NULL, false); + pr_notice("Netfs '%s' unregistered from caching\n", netfs->name); _leave(""); } diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 0438d4cd91ef..43e6e28c164f 100644 --- a/fs/fscache/object-list.c +++ 
b/fs/fscache/object-list.c @@ -36,8 +36,6 @@ struct fscache_objlist_data { #define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ #define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */ #define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */ - - u8 buf[512]; /* key and aux data buffer */ }; /* @@ -170,7 +168,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) struct fscache_cookie *cookie; unsigned long config = data->config; char _type[3], *type; - u8 *buf = data->buf, *p; + u8 *p; if ((unsigned long) v == 1) { seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" @@ -254,7 +252,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) if (fscache_use_cookie(obj)) { uint16_t keylen = 0, auxlen = 0; - switch (cookie->def->type) { + switch (cookie->type) { case 0: type = "IX"; break; @@ -263,7 +261,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) break; default: snprintf(_type, sizeof(_type), "%02u", - cookie->def->type); + cookie->type); type = _type; break; } @@ -274,30 +272,30 @@ static int fscache_objlist_show(struct seq_file *m, void *v) cookie->flags, cookie->netfs_data); - if (cookie->def->get_key && - config & FSCACHE_OBJLIST_CONFIG_KEY) - keylen = cookie->def->get_key(cookie->netfs_data, - buf, 400); + if (config & FSCACHE_OBJLIST_CONFIG_KEY) + keylen = cookie->key_len; - if (cookie->def->get_aux && - config & FSCACHE_OBJLIST_CONFIG_AUX) - auxlen = cookie->def->get_aux(cookie->netfs_data, - buf + keylen, 512 - keylen); - fscache_unuse_cookie(obj); + if (config & FSCACHE_OBJLIST_CONFIG_AUX) + auxlen = cookie->aux_len; if (keylen > 0 || auxlen > 0) { seq_puts(m, " "); - for (p = buf; keylen > 0; keylen--) + p = keylen <= sizeof(cookie->inline_key) ? 
+ cookie->inline_key : cookie->key; + for (; keylen > 0; keylen--) seq_printf(m, "%02x", *p++); if (auxlen > 0) { if (config & FSCACHE_OBJLIST_CONFIG_KEY) seq_puts(m, ", "); + p = auxlen <= sizeof(cookie->inline_aux) ? + cookie->inline_aux : cookie->aux; for (; auxlen > 0; auxlen--) seq_printf(m, "%02x", *p++); } } seq_puts(m, "\n"); + fscache_unuse_cookie(obj); } else { seq_puts(m, "<no_netfs>\n"); } diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 7a182c87f378..20e0d0a4dc8c 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -138,10 +138,13 @@ static const struct fscache_transition fscache_osm_run_oob[] = { { 0, NULL } }; -static int fscache_get_object(struct fscache_object *); -static void fscache_put_object(struct fscache_object *); +static int fscache_get_object(struct fscache_object *, + enum fscache_obj_ref_trace); +static void fscache_put_object(struct fscache_object *, + enum fscache_obj_ref_trace); static bool fscache_enqueue_dependents(struct fscache_object *, int); static void fscache_dequeue_object(struct fscache_object *); +static void fscache_update_aux_data(struct fscache_object *); /* * we need to notify the parent when an op completes that we had outstanding @@ -170,6 +173,7 @@ static void fscache_object_sm_dispatcher(struct fscache_object *object) const struct fscache_transition *t; const struct fscache_state *state, *new_state; unsigned long events, event_mask; + bool oob; int event = -1; ASSERT(object != NULL); @@ -188,6 +192,7 @@ restart_masked: if (events & object->oob_event_mask) { _debug("{OBJ%x} oob %lx", object->debug_id, events & object->oob_event_mask); + oob = true; for (t = object->oob_table; t->events; t++) { if (events & t->events) { state = t->transit_to; @@ -199,6 +204,7 @@ restart_masked: } } } + oob = false; /* Wait states are just transition tables */ if (!state->work) { @@ -207,6 +213,8 @@ restart_masked: if (events & t->events) { new_state = t->transit_to; event = fls(events & t->events) - 1; + 
trace_fscache_osm(object, state, + true, false, event); clear_bit(event, &object->events); _debug("{OBJ%x} ev %d: %s -> %s", object->debug_id, event, @@ -226,6 +234,7 @@ restart_masked: execute_work_state: _debug("{OBJ%x} exec %s", object->debug_id, state->name); + trace_fscache_osm(object, state, false, oob, event); new_state = state->work(object, event); event = -1; if (new_state == NO_TRANSIT) { @@ -279,7 +288,7 @@ static void fscache_object_work_func(struct work_struct *work) start = jiffies; fscache_object_sm_dispatcher(object); fscache_hist(fscache_objs_histogram, start); - fscache_put_object(object); + fscache_put_object(object, fscache_obj_put_work); } /** @@ -397,7 +406,7 @@ static const struct fscache_state *fscache_initialise_object(struct fscache_obje fscache_stat(&fscache_n_cop_grab_object); success = false; if (fscache_object_is_live(parent) && - object->cache->ops->grab_object(object)) { + object->cache->ops->grab_object(object, fscache_obj_get_add_to_deps)) { list_add(&object->dep_link, &parent->dependents); success = true; } @@ -703,6 +712,11 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob ASSERT(cookie != NULL); ASSERT(!hlist_unhashed(&object->cookie_link)); + if (test_bit(FSCACHE_COOKIE_AUX_UPDATED, &cookie->flags)) { + _debug("final update"); + fscache_update_aux_data(object); + } + /* Make sure the cookie no longer points here and that the netfs isn't * waiting for us. 
*/ @@ -745,7 +759,7 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob } /* this just shifts the object release to the work processor */ - fscache_put_object(object); + fscache_put_object(object, fscache_obj_put_drop_obj); fscache_stat(&fscache_n_object_dead); _leave(""); @@ -755,12 +769,13 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob /* * get a ref on an object */ -static int fscache_get_object(struct fscache_object *object) +static int fscache_get_object(struct fscache_object *object, + enum fscache_obj_ref_trace why) { int ret; fscache_stat(&fscache_n_cop_grab_object); - ret = object->cache->ops->grab_object(object) ? 0 : -EAGAIN; + ret = object->cache->ops->grab_object(object, why) ? 0 : -EAGAIN; fscache_stat_d(&fscache_n_cop_grab_object); return ret; } @@ -768,10 +783,11 @@ static int fscache_get_object(struct fscache_object *object) /* * Discard a ref on an object */ -static void fscache_put_object(struct fscache_object *object) +static void fscache_put_object(struct fscache_object *object, + enum fscache_obj_ref_trace why) { fscache_stat(&fscache_n_cop_put_object); - object->cache->ops->put_object(object); + object->cache->ops->put_object(object, why); fscache_stat_d(&fscache_n_cop_put_object); } @@ -786,7 +802,7 @@ void fscache_object_destroy(struct fscache_object *object) fscache_objlist_remove(object); /* We can get rid of the cookie now */ - fscache_cookie_put(object->cookie); + fscache_cookie_put(object->cookie, fscache_cookie_put_object); object->cookie = NULL; } EXPORT_SYMBOL(fscache_object_destroy); @@ -798,7 +814,7 @@ void fscache_enqueue_object(struct fscache_object *object) { _enter("{OBJ%x}", object->debug_id); - if (fscache_get_object(object) >= 0) { + if (fscache_get_object(object, fscache_obj_get_queue) >= 0) { wait_queue_head_t *cong_wq = &get_cpu_var(fscache_object_cong_wait); @@ -806,7 +822,7 @@ void fscache_enqueue_object(struct fscache_object *object) if 
(fscache_object_congested()) wake_up(cong_wq); } else - fscache_put_object(object); + fscache_put_object(object, fscache_obj_put_queue); put_cpu_var(fscache_object_cong_wait); } @@ -866,7 +882,7 @@ static bool fscache_enqueue_dependents(struct fscache_object *object, int event) list_del_init(&dep->dep_link); fscache_raise_event(dep, event); - fscache_put_object(dep); + fscache_put_object(dep, fscache_obj_put_enq_dep); if (!list_empty(&object->dependents) && need_resched()) { ret = false; @@ -906,7 +922,8 @@ static void fscache_dequeue_object(struct fscache_object *object) * and creation). */ enum fscache_checkaux fscache_check_aux(struct fscache_object *object, - const void *data, uint16_t datalen) + const void *data, uint16_t datalen, + loff_t object_size) { enum fscache_checkaux result; @@ -916,7 +933,7 @@ enum fscache_checkaux fscache_check_aux(struct fscache_object *object, } result = object->cookie->def->check_aux(object->cookie->netfs_data, - data, datalen); + data, datalen, object_size); switch (result) { /* entry okay as is */ case FSCACHE_CHECKAUX_OKAY: @@ -956,7 +973,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj * retire the object instead. 
*/ if (!fscache_use_cookie(object)) { - ASSERT(object->cookie->stores.rnode == NULL); + ASSERT(radix_tree_empty(&object->cookie->stores)); set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); _leave(" [no cookie]"); return transit_to(KILL_OBJECT); @@ -972,11 +989,12 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj if (!op) goto nomem; - fscache_operation_init(op, object->cache->ops->invalidate_object, + fscache_operation_init(cookie, op, object->cache->ops->invalidate_object, NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); + trace_fscache_page_op(cookie, NULL, op, fscache_page_op_invalidate); spin_lock(&cookie->lock); if (fscache_submit_exclusive_op(object, op) < 0) @@ -1026,6 +1044,17 @@ static const struct fscache_state *fscache_invalidate_object(struct fscache_obje } /* + * Update auxiliary data. + */ +static void fscache_update_aux_data(struct fscache_object *object) +{ + fscache_stat(&fscache_n_updates_run); + fscache_stat(&fscache_n_cop_update_object); + object->cache->ops->update_object(object); + fscache_stat_d(&fscache_n_cop_update_object); +} + +/* * Asynchronously update an object. */ static const struct fscache_state *fscache_update_object(struct fscache_object *object, @@ -1033,10 +1062,7 @@ static const struct fscache_state *fscache_update_object(struct fscache_object * { _enter("{OBJ%x},%d", object->debug_id, event); - fscache_stat(&fscache_n_updates_run); - fscache_stat(&fscache_n_cop_update_object); - object->cache->ops->update_object(object); - fscache_stat_d(&fscache_n_cop_update_object); + fscache_update_aux_data(object); _leave(""); return transit_to(WAIT_FOR_CMD); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index de67745e1cd7..e30c5975ea58 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -32,7 +32,8 @@ static void fscache_operation_dummy_cancel(struct fscache_operation *op) * Do basic initialisation of an operation. 
The caller must still set flags, * object and processor if needed. */ -void fscache_operation_init(struct fscache_operation *op, +void fscache_operation_init(struct fscache_cookie *cookie, + struct fscache_operation *op, fscache_operation_processor_t processor, fscache_operation_cancel_t cancel, fscache_operation_release_t release) @@ -46,6 +47,7 @@ void fscache_operation_init(struct fscache_operation *op, op->release = release; INIT_LIST_HEAD(&op->pend_link); fscache_stat(&fscache_n_op_initialised); + trace_fscache_op(cookie, op, fscache_op_init); } EXPORT_SYMBOL(fscache_operation_init); @@ -59,6 +61,8 @@ EXPORT_SYMBOL(fscache_operation_init); */ void fscache_enqueue_operation(struct fscache_operation *op) { + struct fscache_cookie *cookie = op->object->cookie; + _enter("{OBJ%x OP%x,%u}", op->object->debug_id, op->debug_id, atomic_read(&op->usage)); @@ -71,12 +75,14 @@ void fscache_enqueue_operation(struct fscache_operation *op) fscache_stat(&fscache_n_op_enqueue); switch (op->flags & FSCACHE_OP_TYPE) { case FSCACHE_OP_ASYNC: + trace_fscache_op(cookie, op, fscache_op_enqueue_async); _debug("queue async"); atomic_inc(&op->usage); if (!queue_work(fscache_op_wq, &op->work)) fscache_put_operation(op); break; case FSCACHE_OP_MYTHREAD: + trace_fscache_op(cookie, op, fscache_op_enqueue_mythread); _debug("queue for caller's attention"); break; default: @@ -101,6 +107,8 @@ static void fscache_run_op(struct fscache_object *object, wake_up_bit(&op->flags, FSCACHE_OP_WAITING); if (op->processor) fscache_enqueue_operation(op); + else + trace_fscache_op(object->cookie, op, fscache_op_run); fscache_stat(&fscache_n_op_run); } @@ -155,6 +163,8 @@ int fscache_submit_exclusive_op(struct fscache_object *object, _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); + trace_fscache_op(object->cookie, op, fscache_op_submit_ex); + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); ASSERTCMP(atomic_read(&op->usage), >, 0); @@ -240,6 +250,8 @@ int fscache_submit_op(struct 
fscache_object *object, _enter("{OBJ%x OP%x},{%u}", object->debug_id, op->debug_id, atomic_read(&op->usage)); + trace_fscache_op(object->cookie, op, fscache_op_submit); + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED); ASSERTCMP(atomic_read(&op->usage), >, 0); @@ -357,6 +369,8 @@ int fscache_cancel_op(struct fscache_operation *op, _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); + trace_fscache_op(object->cookie, op, fscache_op_cancel); + ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING); ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED); ASSERTCMP(atomic_read(&op->usage), >, 0); @@ -419,6 +433,8 @@ void fscache_cancel_all_ops(struct fscache_object *object) fscache_stat(&fscache_n_op_cancelled); list_del_init(&op->pend_link); + trace_fscache_op(object->cookie, op, fscache_op_cancel_all); + ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; @@ -454,9 +470,11 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled) spin_lock(&object->lock); if (!cancelled) { + trace_fscache_op(object->cookie, op, fscache_op_completed); op->state = FSCACHE_OP_ST_COMPLETE; } else { op->cancel(op); + trace_fscache_op(object->cookie, op, fscache_op_cancelled); op->state = FSCACHE_OP_ST_CANCELLED; } @@ -488,6 +506,8 @@ void fscache_put_operation(struct fscache_operation *op) if (!atomic_dec_and_test(&op->usage)) return; + trace_fscache_op(op->object ? 
op->object->cookie : NULL, op, fscache_op_put); + _debug("PUT OP"); ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED && op->state != FSCACHE_OP_ST_COMPLETE, @@ -563,6 +583,8 @@ void fscache_operation_gc(struct work_struct *work) spin_unlock(&cache->op_gc_list_lock); object = op->object; + trace_fscache_op(object->cookie, op, fscache_op_gc); + spin_lock(&object->lock); _debug("GC DEFERRED REL OBJ%x OP%x", @@ -601,6 +623,8 @@ void fscache_op_work_func(struct work_struct *work) _enter("{OBJ%x OP%x,%d}", op->object->debug_id, op->debug_id, atomic_read(&op->usage)); + trace_fscache_op(op->object->cookie, op, fscache_op_work); + ASSERT(op->processor != NULL); start = jiffies; op->processor(op); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 961029e04027..111349f67d98 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -27,6 +27,7 @@ bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page rcu_read_lock(); val = radix_tree_lookup(&cookie->stores, page->index); rcu_read_unlock(); + trace_fscache_check_page(cookie, page, val, 0); return val != NULL; } @@ -39,6 +40,8 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa { wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); + trace_fscache_page(cookie, page, fscache_page_write_wait); + wait_event(*wq, !__fscache_check_page_write(cookie, page)); } EXPORT_SYMBOL(__fscache_wait_on_page_write); @@ -69,6 +72,8 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, _enter("%p,%p,%x", cookie, page, gfp); + trace_fscache_page(cookie, page, fscache_page_maybe_release); + try_again: rcu_read_lock(); val = radix_tree_lookup(&cookie->stores, page->index); @@ -101,6 +106,7 @@ try_again: } xpage = radix_tree_delete(&cookie->stores, page->index); + trace_fscache_page(cookie, page, fscache_page_radix_delete); spin_unlock(&cookie->stores_lock); if (xpage) { @@ -112,6 +118,7 @@ try_again: } wake_up_bit(&cookie->flags, 0); + 
trace_fscache_wake_cookie(cookie); if (xpage) put_page(xpage); __fscache_uncache_page(cookie, page); @@ -144,7 +151,7 @@ static void fscache_end_page_write(struct fscache_object *object, struct page *page) { struct fscache_cookie *cookie; - struct page *xpage = NULL; + struct page *xpage = NULL, *val; spin_lock(&object->lock); cookie = object->cookie; @@ -154,13 +161,24 @@ static void fscache_end_page_write(struct fscache_object *object, spin_lock(&cookie->stores_lock); radix_tree_tag_clear(&cookie->stores, page->index, FSCACHE_COOKIE_STORING_TAG); + trace_fscache_page(cookie, page, fscache_page_radix_clear_store); if (!radix_tree_tag_get(&cookie->stores, page->index, FSCACHE_COOKIE_PENDING_TAG)) { fscache_stat(&fscache_n_store_radix_deletes); xpage = radix_tree_delete(&cookie->stores, page->index); + trace_fscache_page(cookie, page, fscache_page_radix_delete); + trace_fscache_page(cookie, page, fscache_page_write_end); + + val = radix_tree_lookup(&cookie->stores, page->index); + trace_fscache_check_page(cookie, page, val, 1); + } else { + trace_fscache_page(cookie, page, fscache_page_write_end_pend); } spin_unlock(&cookie->stores_lock); wake_up_bit(&cookie->flags, 0); + trace_fscache_wake_cookie(cookie); + } else { + trace_fscache_page(cookie, page, fscache_page_write_end_noc); } spin_unlock(&object->lock); if (xpage) @@ -185,9 +203,11 @@ static void fscache_attr_changed_op(struct fscache_operation *op) fscache_stat_d(&fscache_n_cop_attr_changed); if (ret < 0) fscache_abort_object(object); + fscache_op_complete(op, ret < 0); + } else { + fscache_op_complete(op, true); } - fscache_op_complete(op, true); _leave(""); } @@ -213,7 +233,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return -ENOMEM; } - fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL); + fscache_operation_init(cookie, op, fscache_attr_changed_op, NULL, NULL); + trace_fscache_page_op(cookie, NULL, op, fscache_page_op_attr_changed); op->flags = FSCACHE_OP_ASYNC | (1 << 
FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -297,7 +318,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval( return NULL; } - fscache_operation_init(&op->op, NULL, + fscache_operation_init(cookie, &op->op, NULL, fscache_do_cancel_retrieval, fscache_release_retrieval_op); op->op.flags = FSCACHE_OP_MYTHREAD | @@ -368,6 +389,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, fscache_stat(stat_op_waits); if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, TASK_INTERRUPTIBLE) != 0) { + trace_fscache_op(object->cookie, op, fscache_op_signal); ret = fscache_cancel_op(op, false); if (ret == 0) return -ERESTARTSYS; @@ -389,6 +411,7 @@ check_if_dead: if (unlikely(fscache_object_is_dying(object) || fscache_cache_is_broken(object))) { enum fscache_operation_state state = op->state; + trace_fscache_op(object->cookie, op, fscache_op_signal); fscache_cancel_op(op, true); if (stat_object_dead) fscache_stat(stat_object_dead); @@ -443,6 +466,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, return -ENOMEM; } atomic_set(&op->n_pages, 1); + trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_retr_one); spin_lock(&cookie->lock); @@ -571,6 +595,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, if (!op) return -ENOMEM; atomic_set(&op->n_pages, *nr_pages); + trace_fscache_page_op(cookie, NULL, &op->op, fscache_page_op_retr_multi); spin_lock(&cookie->lock); @@ -682,6 +707,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, if (!op) return -ENOMEM; atomic_set(&op->n_pages, 1); + trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_alloc_one); spin_lock(&cookie->lock); @@ -776,15 +802,17 @@ static void fscache_write_op(struct fscache_operation *_op) _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); +again: spin_lock(&object->lock); cookie = object->cookie; if (!fscache_object_is_active(object)) { - /* If we get here, then the on-disk cache object likely longer - * 
exists, so we should just cancel this write operation. + /* If we get here, then the on-disk cache object likely no + * longer exists, so we should just cancel this write + * operation. */ spin_unlock(&object->lock); - fscache_op_complete(&op->op, false); + fscache_op_complete(&op->op, true); _leave(" [inactive]"); return; } @@ -797,7 +825,7 @@ static void fscache_write_op(struct fscache_operation *_op) * cancel this write operation. */ spin_unlock(&object->lock); - fscache_op_complete(&op->op, false); + fscache_op_complete(&op->op, true); _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}", _op->flags, _op->state, object->state->short_name, object->flags); @@ -809,30 +837,33 @@ static void fscache_write_op(struct fscache_operation *_op) fscache_stat(&fscache_n_store_calls); /* find a page to store */ + results[0] = NULL; page = NULL; n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1, FSCACHE_COOKIE_PENDING_TAG); + trace_fscache_gang_lookup(cookie, &op->op, results, n, op->store_limit); if (n != 1) goto superseded; page = results[0]; _debug("gang %d [%lx]", n, page->index); - if (page->index >= op->store_limit) { - fscache_stat(&fscache_n_store_pages_over_limit); - goto superseded; - } radix_tree_tag_set(&cookie->stores, page->index, FSCACHE_COOKIE_STORING_TAG); radix_tree_tag_clear(&cookie->stores, page->index, FSCACHE_COOKIE_PENDING_TAG); + trace_fscache_page(cookie, page, fscache_page_radix_pend2store); spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); + if (page->index >= op->store_limit) + goto discard_page; + fscache_stat(&fscache_n_store_pages); fscache_stat(&fscache_n_cop_write_page); ret = object->cache->ops->write_page(op, page); fscache_stat_d(&fscache_n_cop_write_page); + trace_fscache_wrote_page(cookie, page, &op->op, ret); fscache_end_page_write(object, page); if (ret < 0) { fscache_abort_object(object); @@ -844,6 +875,12 @@ static void fscache_write_op(struct fscache_operation *_op) _leave(""); return; +discard_page: + 
fscache_stat(&fscache_n_store_pages_over_limit); + trace_fscache_wrote_page(cookie, page, &op->op, -ENOBUFS); + fscache_end_page_write(object, page); + goto again; + superseded: /* this writer is going away and there aren't any more things to * write */ @@ -851,7 +888,7 @@ superseded: spin_unlock(&cookie->stores_lock); clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); spin_unlock(&object->lock); - fscache_op_complete(&op->op, true); + fscache_op_complete(&op->op, false); _leave(""); } @@ -879,6 +916,8 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) for (i = n - 1; i >= 0; i--) { page = results[i]; radix_tree_delete(&cookie->stores, page->index); + trace_fscache_page(cookie, page, fscache_page_radix_delete); + trace_fscache_page(cookie, page, fscache_page_inval); } spin_unlock(&cookie->stores_lock); @@ -888,6 +927,7 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) } wake_up_bit(&cookie->flags, 0); + trace_fscache_wake_cookie(cookie); _leave(""); } @@ -923,6 +963,7 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) */ int __fscache_write_page(struct fscache_cookie *cookie, struct page *page, + loff_t object_size, gfp_t gfp) { struct fscache_storage *op; @@ -946,7 +987,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (!op) goto nomem; - fscache_operation_init(&op->op, fscache_write_op, NULL, + fscache_operation_init(cookie, &op->op, fscache_write_op, NULL, fscache_release_write_op); op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING) | @@ -956,6 +997,8 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (ret < 0) goto nomem_free; + trace_fscache_page_op(cookie, page, &op->op, fscache_page_op_write_one); + ret = -ENOBUFS; spin_lock(&cookie->lock); @@ -967,9 +1010,15 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) goto nobufs; + trace_fscache_page(cookie, page, fscache_page_write); + /* add the page to the 
pending-storage radix tree on the backing * object */ spin_lock(&object->lock); + + if (object->store_limit_l != object_size) + fscache_set_store_limit(object, object_size); + spin_lock(&cookie->stores_lock); _debug("store limit %llx", (unsigned long long) object->store_limit); @@ -982,8 +1031,10 @@ int __fscache_write_page(struct fscache_cookie *cookie, goto nobufs_unlock_obj; } + trace_fscache_page(cookie, page, fscache_page_radix_insert); radix_tree_tag_set(&cookie->stores, page->index, FSCACHE_COOKIE_PENDING_TAG); + trace_fscache_page(cookie, page, fscache_page_radix_set_pend); get_page(page); /* we only want one writer at a time, but we do need to queue new @@ -1026,6 +1077,7 @@ already_pending: submit_failed: spin_lock(&cookie->stores_lock); radix_tree_delete(&cookie->stores, page->index); + trace_fscache_page(cookie, page, fscache_page_radix_delete); spin_unlock(&cookie->stores_lock); wake_cookie = __fscache_unuse_cookie(cookie); put_page(page); @@ -1072,6 +1124,8 @@ void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page) if (!PageFsCache(page)) goto done; + trace_fscache_page(cookie, page, fscache_page_uncache); + /* get the object */ spin_lock(&cookie->lock); @@ -1120,6 +1174,8 @@ void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page) atomic_inc(&fscache_n_marks); #endif + trace_fscache_page(cookie, page, fscache_page_cached); + _debug("- mark %p{%lx}", page, page->index); if (TestSetPageFsCache(page)) { static bool once_only; diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 7ac6e839b065..fcc8c2f2690e 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -21,7 +21,6 @@ atomic_t fscache_n_op_pend; atomic_t fscache_n_op_run; atomic_t fscache_n_op_enqueue; -atomic_t fscache_n_op_requeue; atomic_t fscache_n_op_deferred_release; atomic_t fscache_n_op_initialised; atomic_t fscache_n_op_release; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 624f18bbfd2b..ef309958e060 100644 --- a/fs/fuse/inode.c 
+++ b/fs/fuse/inode.c @@ -1080,6 +1080,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; sb->s_export_op = &fuse_export_operations; + sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; + if (sb->s_user_ns != &init_user_ns) + sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; file = fget(d.fd); err = -EINVAL; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 685c305cbeb6..278ed0869c3c 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1744,7 +1744,7 @@ do_grow_qunlock: * @newsize: the size to make the file * * The file size can grow, shrink, or stay the same size. This - * is called holding i_mutex and an exclusive glock on the inode + * is called holding i_rwsem and an exclusive glock on the inode * in question. * * Returns: errno diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 82fb5583445c..097bd3c0f270 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1923,28 +1923,37 @@ void gfs2_glock_exit(void) static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n) { - if (n == 0) - gi->gl = rhashtable_walk_peek(&gi->hti); - else { - gi->gl = rhashtable_walk_next(&gi->hti); - n--; + struct gfs2_glock *gl = gi->gl; + + if (gl) { + if (n == 0) + return; + if (!lockref_put_not_zero(&gl->gl_lockref)) + gfs2_glock_queue_put(gl); } for (;;) { - if (IS_ERR_OR_NULL(gi->gl)) { - if (!gi->gl) - return; - if (PTR_ERR(gi->gl) != -EAGAIN) { - gi->gl = NULL; - return; + gl = rhashtable_walk_next(&gi->hti); + if (IS_ERR_OR_NULL(gl)) { + if (gl == ERR_PTR(-EAGAIN)) { + n = 1; + continue; } - n = 0; - } else if (gi->sdp == gi->gl->gl_name.ln_sbd && - !__lockref_is_dead(&gi->gl->gl_lockref)) { - if (!n--) - break; + gl = NULL; + break; + } + if (gl->gl_name.ln_sbd != gi->sdp) + continue; + if (n <= 1) { + if (!lockref_get_not_dead(&gl->gl_lockref)) + continue; + break; + } else { + if (__lockref_is_dead(&gl->gl_lockref)) + continue; + n--; } - gi->gl = rhashtable_walk_next(&gi->hti); } + gi->gl 
= gl; } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) @@ -1988,7 +1997,6 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) { struct gfs2_glock_iter *gi = seq->private; - gi->gl = NULL; rhashtable_walk_stop(&gi->hti); } @@ -2076,7 +2084,8 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; - gi->gl = NULL; + if (gi->gl) + gfs2_glock_put(gi->gl); rhashtable_walk_exit(&gi->hti); return seq_release_private(inode, file); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e6a0a8a89ea7..3ba3f167641c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -825,7 +825,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) goto fail_rindex; } /* - * i_mutex on quota files is special. Since this inode is hidden system + * i_rwsem on quota files is special. Since this inode is hidden system * file, we are safe to define locking ourselves. 
*/ lockdep_set_class(&sdp->sd_quota_inode->i_rwsem, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 620be0521866..cf5c7f3080d2 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -800,7 +800,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) int need_endtrans = 0; int ret; - if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC))) + if (!(flags & I_DIRTY_INODE)) return; if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) return; diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 05de20954659..f2bce1e0f6fb 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -308,7 +308,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, } ip->i_inode.i_ctime = current_time(&ip->i_inode); - __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(sdp); @@ -768,7 +768,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, goto out_end_trans; ip->i_inode.i_ctime = current_time(&ip->i_inode); - __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); out_end_trans: gfs2_trans_end(GFS2_SB(&ip->i_inode)); @@ -896,7 +896,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, ea_set_remove_stuffed(ip, es->es_el); ip->i_inode.i_ctime = current_time(&ip->i_inode); - __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); return error; @@ -1114,7 +1114,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) } ip->i_inode.i_ctime = current_time(&ip->i_inode); - __mark_inode_dirty(&ip->i_inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b9a254dcc0e7..d508c7844681 100644 
--- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -138,10 +138,14 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) /* * page based offset in vm_pgoff could be sufficiently large to - * overflow a (l)off_t when converted to byte offset. + * overflow a loff_t when converted to byte offset. This can + * only happen on architectures where sizeof(loff_t) == + * sizeof(unsigned long). So, only check in those instances. */ - if (vma->vm_pgoff & PGOFF_LOFFT_MAX) - return -EINVAL; + if (sizeof(unsigned long) == sizeof(loff_t)) { + if (vma->vm_pgoff & PGOFF_LOFFT_MAX) + return -EINVAL; + } /* must be huge page aligned */ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) diff --git a/fs/inode.c b/fs/inode.c index b153aeaa61ea..13ceb98c3bd3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -348,8 +348,7 @@ EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT); - spin_lock_init(&mapping->tree_lock); + INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); @@ -504,14 +503,14 @@ EXPORT_SYMBOL(__remove_inode_hash); void clear_inode(struct inode *inode) { /* - * We have to cycle tree_lock here because reclaim can be still in the + * We have to cycle the i_pages lock here because reclaim can be in the * process of removing the last page (in __delete_from_page_cache()) - * and we must not free mapping under it. + * and we must not free the mapping under it. 
*/ - spin_lock_irq(&inode->i_data.tree_lock); + xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); BUG_ON(inode->i_data.nrexceptional); - spin_unlock_irq(&inode->i_data.tree_lock); + xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); diff --git a/fs/internal.h b/fs/internal.h index 980d005b21b4..e08972db0303 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -125,7 +125,6 @@ int do_fchmodat(int dfd, const char __user *filename, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); -extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file *filp_clone_open(struct file *); diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 4a6cf289be24..83b8f06b4a64 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -21,14 +21,6 @@ #include <linux/pagemap.h> #include "nodelist.h" -struct erase_priv_struct { - struct jffs2_eraseblock *jeb; - struct jffs2_sb_info *c; -}; - -#ifndef __ECOS -static void jffs2_erase_callback(struct erase_info *); -#endif static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset); static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); @@ -51,7 +43,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, jffs2_dbg(1, "%s(): erase block %#08x (range %#08x-%#08x)\n", __func__, jeb->offset, jeb->offset, jeb->offset + c->sector_size); - instr = kmalloc(sizeof(struct erase_info) + sizeof(struct erase_priv_struct), GFP_KERNEL); + instr = kmalloc(sizeof(struct erase_info), GFP_KERNEL); if (!instr) { pr_warn("kmalloc for struct erase_info in jffs2_erase_block failed. 
Refiling block for later\n"); mutex_lock(&c->erase_free_sem); @@ -67,18 +59,15 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, memset(instr, 0, sizeof(*instr)); - instr->mtd = c->mtd; instr->addr = jeb->offset; instr->len = c->sector_size; - instr->callback = jffs2_erase_callback; - instr->priv = (unsigned long)(&instr[1]); - - ((struct erase_priv_struct *)instr->priv)->jeb = jeb; - ((struct erase_priv_struct *)instr->priv)->c = c; ret = mtd_erase(c->mtd, instr); - if (!ret) + if (!ret) { + jffs2_erase_succeeded(c, jeb); + kfree(instr); return; + } bad_offset = instr->fail_addr; kfree(instr); @@ -214,22 +203,6 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock wake_up(&c->erase_wait); } -#ifndef __ECOS -static void jffs2_erase_callback(struct erase_info *instr) -{ - struct erase_priv_struct *priv = (void *)instr->priv; - - if(instr->state != MTD_ERASE_DONE) { - pr_warn("Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", - (unsigned long long)instr->addr, instr->state); - jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); - } else { - jffs2_erase_succeeded(priv->c, priv->jeb); - } - kfree(instr); -} -#endif /* !__ECOS */ - /* Hmmm. Maybe we should accept the extra space it takes and make this a standard doubly-linked list? */ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, diff --git a/fs/libfs.c b/fs/libfs.c index 7ff3cb904acd..0fb590d79f30 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1060,6 +1060,45 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); +int noop_set_page_dirty(struct page *page) +{ + /* + * Unlike __set_page_dirty_no_writeback that handles dirty page + * tracking in the page object, dax does all dirty tracking in + * the inode address_space in response to mkwrite faults. 
In the + * dax case we only need to worry about potentially dirty CPU + * caches, not dirty page cache pages to write back. + * + * This callback is defined to prevent fallback to + * __set_page_dirty_buffers() in set_page_dirty(). + */ + return 0; +} +EXPORT_SYMBOL_GPL(noop_set_page_dirty); + +void noop_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + /* + * There is no page cache to invalidate in the dax case, however + * we need this callback defined to prevent falling back to + * block_invalidatepage() in do_invalidatepage(). + */ +} +EXPORT_SYMBOL_GPL(noop_invalidatepage); + +ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + /* + * iomap based filesystems support direct I/O without need for + * this callback. However, it still needs to be set in + * inode->a_ops so that open/fcntl know that direct I/O is + * generally supported. + */ + return -EINVAL; +} +EXPORT_SYMBOL_GPL(noop_direct_IO); + /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ void kfree_link(void *p) { diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 9c36d614bf89..346ed161756d 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -57,8 +57,8 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; -atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0); -DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq); +static atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq); unsigned int lockd_net_id; diff --git a/fs/locks.c b/fs/locks.c index d6ff4beb70ce..62bbe8b31f26 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -559,7 +559,7 @@ static const struct lock_manager_operations lease_manager_ops = { * Initialize a lease, use the default lock manager operations */ static int lease_init(struct file *filp, long type, struct file_lock *fl) - { +{ if (assign_type(fl, type) != 0) return -EINVAL; diff --git a/fs/namei.c b/fs/namei.c index a09419379f5d..186bd2464fd5 100644 --- 
a/fs/namei.c +++ b/fs/namei.c @@ -39,6 +39,7 @@ #include <linux/bitops.h> #include <linux/init_task.h> #include <linux/uaccess.h> +#include <linux/build_bug.h> #include "internal.h" #include "mount.h" @@ -130,6 +131,7 @@ getname_flags(const char __user *filename, int flags, int *empty) struct filename *result; char *kname; int len; + BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0); result = audit_reusename(filename); if (result) @@ -222,9 +224,10 @@ getname_kernel(const char * filename) if (len <= EMBEDDED_NAME_MAX) { result->name = (char *)result->iname; } else if (len <= PATH_MAX) { + const size_t size = offsetof(struct filename, iname[1]); struct filename *tmp; - tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); + tmp = kmalloc(size, GFP_KERNEL); if (unlikely(!tmp)) { __putname(result); return ERR_PTR(-ENOMEM); @@ -927,7 +930,8 @@ static inline int may_follow_link(struct nameidata *nd) if (nd->flags & LOOKUP_RCU) return -ECHILD; - audit_log_link_denied("follow_link", &nd->stack[0].link); + audit_inode(nd->name, nd->stack[0].link.dentry, 0); + audit_log_link_denied("follow_link"); return -EACCES; } @@ -993,7 +997,7 @@ static int may_linkat(struct path *link) if (safe_hardlink_source(inode) || inode_owner_or_capable(inode)) return 0; - audit_log_link_denied("linkat", link); + audit_log_link_denied("linkat"); return -EPERM; } @@ -1594,22 +1598,21 @@ static int lookup_fast(struct nameidata *nd, } /* Fast lookup failed, do it the slow way */ -static struct dentry *lookup_slow(const struct qstr *name, - struct dentry *dir, - unsigned int flags) +static struct dentry *__lookup_slow(const struct qstr *name, + struct dentry *dir, + unsigned int flags) { - struct dentry *dentry = ERR_PTR(-ENOENT), *old; + struct dentry *dentry, *old; struct inode *inode = dir->d_inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); - inode_lock_shared(inode); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) - goto out; + return ERR_PTR(-ENOENT); again: dentry = 
d_alloc_parallel(dir, name, &wq); if (IS_ERR(dentry)) - goto out; + return dentry; if (unlikely(!d_in_lookup(dentry))) { if (!(flags & LOOKUP_NO_REVAL)) { int error = d_revalidate(dentry, flags); @@ -1631,11 +1634,21 @@ again: dentry = old; } } -out: - inode_unlock_shared(inode); return dentry; } +static struct dentry *lookup_slow(const struct qstr *name, + struct dentry *dir, + unsigned int flags) +{ + struct inode *inode = dir->d_inode; + struct dentry *res; + inode_lock_shared(inode); + res = __lookup_slow(name, dir, flags); + inode_unlock_shared(inode); + return res; +} + static inline int may_lookup(struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { @@ -2418,56 +2431,63 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(vfs_path_lookup); -/** - * lookup_one_len - filesystem helper to lookup single pathname component - * @name: pathname component to lookup - * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to - * - * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. - * - * The caller must hold base->i_mutex. 
- */ -struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) +static int lookup_one_len_common(const char *name, struct dentry *base, + int len, struct qstr *this) { - struct qstr this; - unsigned int c; - int err; - - WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - - this.name = name; - this.len = len; - this.hash = full_name_hash(base, name, len); + this->name = name; + this->len = len; + this->hash = full_name_hash(base, name, len); if (!len) - return ERR_PTR(-EACCES); + return -EACCES; if (unlikely(name[0] == '.')) { if (len < 2 || (len == 2 && name[1] == '.')) - return ERR_PTR(-EACCES); + return -EACCES; } while (len--) { - c = *(const unsigned char *)name++; + unsigned int c = *(const unsigned char *)name++; if (c == '/' || c == '\0') - return ERR_PTR(-EACCES); + return -EACCES; } /* * See if the low-level filesystem might want * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { - int err = base->d_op->d_hash(base, &this); + int err = base->d_op->d_hash(base, this); if (err < 0) - return ERR_PTR(err); + return err; } - err = inode_permission(base->d_inode, MAY_EXEC); + return inode_permission(base->d_inode, MAY_EXEC); +} + +/** + * lookup_one_len - filesystem helper to lookup single pathname component + * @name: pathname component to lookup + * @base: base directory to lookup from + * @len: maximum length @len should be interpreted to + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. + * + * The caller must hold base->i_mutex. + */ +struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) +{ + struct dentry *dentry; + struct qstr this; + int err; + + WARN_ON_ONCE(!inode_is_locked(base->d_inode)); + + err = lookup_one_len_common(name, base, len, &this); if (err) return ERR_PTR(err); - return __lookup_hash(&this, base, 0); + dentry = lookup_dcache(&this, base, 0); + return dentry ? 
dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one_len); @@ -2487,37 +2507,10 @@ struct dentry *lookup_one_len_unlocked(const char *name, struct dentry *base, int len) { struct qstr this; - unsigned int c; int err; struct dentry *ret; - this.name = name; - this.len = len; - this.hash = full_name_hash(base, name, len); - if (!len) - return ERR_PTR(-EACCES); - - if (unlikely(name[0] == '.')) { - if (len < 2 || (len == 2 && name[1] == '.')) - return ERR_PTR(-EACCES); - } - - while (len--) { - c = *(const unsigned char *)name++; - if (c == '/' || c == '\0') - return ERR_PTR(-EACCES); - } - /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (base->d_flags & DCACHE_OP_HASH) { - int err = base->d_op->d_hash(base, &this); - if (err < 0) - return ERR_PTR(err); - } - - err = inode_permission(base->d_inode, MAY_EXEC); + err = lookup_one_len_common(name, base, len, &this); if (err) return ERR_PTR(err); @@ -3374,9 +3367,7 @@ finish_open_created: goto out; *opened |= FILE_OPENED; opened: - error = open_check_o_direct(file); - if (!error) - error = ima_file_check(file, op->acc_mode, *opened); + error = ima_file_check(file, op->acc_mode, *opened); if (!error && will_truncate) error = handle_truncate(file); out: @@ -3456,9 +3447,6 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, error = finish_open(file, child, NULL, opened); if (error) goto out2; - error = open_check_o_direct(file); - if (error) - fput(file); out2: mnt_drop_write(path.mnt); out: diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 123c069429a7..a813979b5be0 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -535,35 +535,10 @@ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char return 0; } -#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) -#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) -static __be32 encode_attr_bitmap(struct xdr_stream *xdr, 
const uint32_t *bitmap, __be32 **savep) +static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, size_t sz) { - __be32 bm[2]; - __be32 *p; - - bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0); - bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1); - if (bm[1] != 0) { - p = xdr_reserve_space(xdr, 16); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *p++ = htonl(2); - *p++ = bm[0]; - *p++ = bm[1]; - } else if (bm[0] != 0) { - p = xdr_reserve_space(xdr, 12); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *p++ = htonl(1); - *p++ = bm[0]; - } else { - p = xdr_reserve_space(xdr, 8); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *p++ = htonl(0); - } - *savep = p; + if (xdr_stream_encode_uint32_array(xdr, bitmap, sz) < 0) + return cpu_to_be32(NFS4ERR_RESOURCE); return 0; } @@ -656,9 +631,13 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (unlikely(status != 0)) goto out; - status = encode_attr_bitmap(xdr, res->bitmap, &savep); + status = encode_attr_bitmap(xdr, res->bitmap, ARRAY_SIZE(res->bitmap)); if (unlikely(status != 0)) goto out; + status = cpu_to_be32(NFS4ERR_RESOURCE); + savep = xdr_reserve_space(xdr, sizeof(*savep)); + if (unlikely(!savep)) + goto out; status = encode_attr_change(xdr, res->bitmap, res->change_attr); if (unlikely(status != 0)) goto out; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d8b47624fee2..1819d0d0ba4b 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -19,6 +19,7 @@ #include <linux/nfs_xdr.h> #include "nfs4_fs.h" +#include "nfs4session.h" #include "delegation.h" #include "internal.h" #include "nfs4trace.h" @@ -171,11 +172,15 @@ again: * nfs_inode_reclaim_delegation - process a delegation reclaim request * @inode: inode to process * @cred: credential to use for request - * @res: new delegation state from server + * @type: delegation type + * @stateid: delegation stateid + * @pagemod_limit: write delegation 
"space_limit" * */ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, - struct nfs_openres *res) + fmode_t type, + const nfs4_stateid *stateid, + unsigned long pagemod_limit) { struct nfs_delegation *delegation; struct rpc_cred *oldcred = NULL; @@ -185,9 +190,9 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, if (delegation != NULL) { spin_lock(&delegation->lock); if (delegation->inode != NULL) { - nfs4_stateid_copy(&delegation->stateid, &res->delegation); - delegation->type = res->delegation_type; - delegation->pagemod_limit = res->pagemod_limit; + nfs4_stateid_copy(&delegation->stateid, stateid); + delegation->type = type; + delegation->pagemod_limit = pagemod_limit; oldcred = delegation->cred; delegation->cred = get_rpccred(cred); clear_bit(NFS_DELEGATION_NEED_RECLAIM, @@ -195,14 +200,14 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, spin_unlock(&delegation->lock); rcu_read_unlock(); put_rpccred(oldcred); - trace_nfs4_reclaim_delegation(inode, res->delegation_type); + trace_nfs4_reclaim_delegation(inode, type); return; } /* We appear to have raced with a delegation return. */ spin_unlock(&delegation->lock); } rcu_read_unlock(); - nfs_inode_set_delegation(inode, cred, res); + nfs_inode_set_delegation(inode, cred, type, stateid, pagemod_limit); } static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) @@ -329,11 +334,16 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation, * nfs_inode_set_delegation - set up a delegation on an inode * @inode: inode to which delegation applies * @cred: cred to use for subsequent delegation processing - * @res: new delegation state from server + * @type: delegation type + * @stateid: delegation stateid + * @pagemod_limit: write delegation "space_limit" * * Returns zero on success, or a negative errno value. 
*/ -int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, + fmode_t type, + const nfs4_stateid *stateid, + unsigned long pagemod_limit) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_client *clp = server->nfs_client; @@ -345,9 +355,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = kmalloc(sizeof(*delegation), GFP_NOFS); if (delegation == NULL) return -ENOMEM; - nfs4_stateid_copy(&delegation->stateid, &res->delegation); - delegation->type = res->delegation_type; - delegation->pagemod_limit = res->pagemod_limit; + nfs4_stateid_copy(&delegation->stateid, stateid); + delegation->type = type; + delegation->pagemod_limit = pagemod_limit; delegation->change_attr = inode_peek_iversion_raw(inode); delegation->cred = get_rpccred(cred); delegation->inode = inode; @@ -392,7 +402,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; - trace_nfs4_set_delegation(inode, res->delegation_type); + trace_nfs4_set_delegation(inode, type); out: spin_unlock(&clp->cl_lock); @@ -547,6 +557,22 @@ int nfs4_inode_return_delegation(struct inode *inode) return err; } +/** + * nfs4_inode_make_writeable + * @inode: pointer to inode + * + * Make the inode writeable by returning the delegation if necessary + * + * Returns zero on success, or a negative errno value. 
+ */ +int nfs4_inode_make_writeable(struct inode *inode) +{ + if (!nfs4_has_session(NFS_SERVER(inode)->nfs_client) || + !nfs4_check_delegation(inode, FMODE_WRITE)) + return nfs4_inode_return_delegation(inode); + return 0; +} + static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, struct nfs_delegation *delegation) { diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 185a09f37a89..bb1ef8c37af4 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -36,8 +36,10 @@ enum { NFS_DELEGATION_TEST_EXPIRED, }; -int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); -void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, + fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit); +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, + fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit); int nfs4_inode_return_delegation(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_inode_return_delegation_noreclaim(struct inode *inode); @@ -70,6 +72,7 @@ int nfs4_check_delegation(struct inode *inode, fmode_t flags); bool nfs4_delegation_flush_on_close(const struct inode *inode); void nfs_inode_find_delegation_state_and_recover(struct inode *inode, const nfs4_stateid *stateid); +int nfs4_inode_make_writeable(struct inode *inode); #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2f3f86726f5b..73f8b43d988c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1272,7 +1272,9 @@ static void nfs_drop_nlink(struct inode *inode) /* drop the inode if we're reasonably sure this is the last link */ if (inode->i_nlink == 1) clear_nlink(inode); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE + | 
NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_OTHER; spin_unlock(&inode->i_lock); } @@ -1798,12 +1800,11 @@ static int nfs_safe_remove(struct dentry *dentry) trace_nfs_remove_enter(dir, dentry); if (inode != NULL) { - NFS_PROTO(inode)->return_delegation(inode); - error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + error = NFS_PROTO(dir)->remove(dir, dentry); if (error == 0) nfs_drop_nlink(inode); } else - error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + error = NFS_PROTO(dir)->remove(dir, dentry); if (error == -ENOENT) nfs_dentry_handle_enoent(dentry); trace_nfs_remove_exit(dir, dentry, error); @@ -1932,8 +1933,6 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) old_dentry, dentry); trace_nfs_link_enter(inode, dir, dentry); - NFS_PROTO(inode)->return_delegation(inode); - d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { @@ -2023,10 +2022,6 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - NFS_PROTO(old_inode)->return_delegation(old_inode); - if (new_inode != NULL) - NFS_PROTO(new_inode)->return_delegation(new_inode); - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 0ee4b93d36ea..1c5d8d31fc0a 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -50,59 +50,6 @@ void nfs_fscache_unregister(void) } /* - * Layout of the key for an NFS server cache object. 
- */ -struct nfs_server_key { - uint16_t nfsversion; /* NFS protocol version */ - uint16_t family; /* address family */ - uint16_t port; /* IP port */ - union { - struct in_addr ipv4_addr; /* IPv4 address */ - struct in6_addr ipv6_addr; /* IPv6 address */ - } addr[0]; -}; - -/* - * Generate a key to describe a server in the main NFS index - * - We return the length of the key, or 0 if we can't generate one - */ -static uint16_t nfs_server_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct nfs_client *clp = cookie_netfs_data; - const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; - const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; - struct nfs_server_key *key = buffer; - uint16_t len = sizeof(struct nfs_server_key); - - memset(key, 0, len); - key->nfsversion = clp->rpc_ops->version; - key->family = clp->cl_addr.ss_family; - - switch (clp->cl_addr.ss_family) { - case AF_INET: - key->port = sin->sin_port; - key->addr[0].ipv4_addr = sin->sin_addr; - len += sizeof(key->addr[0].ipv4_addr); - break; - - case AF_INET6: - key->port = sin6->sin6_port; - key->addr[0].ipv6_addr = sin6->sin6_addr; - len += sizeof(key->addr[0].ipv6_addr); - break; - - default: - printk(KERN_WARNING "NFS: Unknown network family '%d'\n", - clp->cl_addr.ss_family); - len = 0; - break; - } - - return len; -} - -/* * Define the server object for FS-Cache. This is used to describe a server * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and * server address parameters. 
@@ -110,33 +57,9 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data, const struct fscache_cookie_def nfs_fscache_server_index_def = { .name = "NFS.server", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = nfs_server_get_key, }; /* - * Generate a key to describe a superblock key in the main NFS index - */ -static uint16_t nfs_super_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct nfs_fscache_key *key; - const struct nfs_server *nfss = cookie_netfs_data; - uint16_t len; - - key = nfss->fscache_key; - len = sizeof(key->key) + key->key.uniq_len; - if (len > bufmax) { - len = 0; - } else { - memcpy(buffer, &key->key, sizeof(key->key)); - memcpy(buffer + sizeof(key->key), - key->key.uniquifier, key->key.uniq_len); - } - - return len; -} - -/* * Define the superblock object for FS-Cache. This is used to describe a * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS * parameters that might cause a separate superblock. @@ -144,84 +67,9 @@ static uint16_t nfs_super_get_key(const void *cookie_netfs_data, const struct fscache_cookie_def nfs_fscache_super_index_def = { .name = "NFS.super", .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = nfs_super_get_key, }; /* - * Definition of the auxiliary data attached to NFS inode storage objects - * within the cache. - * - * The contents of this struct are recorded in the on-disk local cache in the - * auxiliary data attached to the data storage object backing an inode. This - * permits coherency to be managed when a new inode binds to an already extant - * cache object. 
- */ -struct nfs_fscache_inode_auxdata { - struct timespec mtime; - struct timespec ctime; - loff_t size; - u64 change_attr; -}; - -/* - * Generate a key to describe an NFS inode in an NFS server's index - */ -static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct nfs_inode *nfsi = cookie_netfs_data; - uint16_t nsize; - - /* use the inode's NFS filehandle as the key */ - nsize = nfsi->fh.size; - memcpy(buffer, nfsi->fh.data, nsize); - return nsize; -} - -/* - * Get certain file attributes from the netfs data - * - This function can be absent for an index - * - Not permitted to return an error - * - The netfs data from the cookie being used as the source is presented - */ -static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data, - uint64_t *size) -{ - const struct nfs_inode *nfsi = cookie_netfs_data; - - *size = nfsi->vfs_inode.i_size; -} - -/* - * Get the auxiliary data from netfs data - * - This function can be absent if the index carries no state data - * - Should store the auxiliary data in the buffer - * - Should return the amount of amount stored - * - Not permitted to return an error - * - The netfs data from the cookie being used as the source is presented - */ -static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - struct nfs_fscache_inode_auxdata auxdata; - const struct nfs_inode *nfsi = cookie_netfs_data; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.size = nfsi->vfs_inode.i_size; - auxdata.mtime = nfsi->vfs_inode.i_mtime; - auxdata.ctime = nfsi->vfs_inode.i_ctime; - - if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); - - if (bufmax > sizeof(auxdata)) - bufmax = sizeof(auxdata); - - memcpy(buffer, &auxdata, bufmax); - return bufmax; -} - -/* * Consult the netfs about the state of an object * - This function can be absent if the 
index carries no state data * - The netfs data from the cookie being used as the target is @@ -230,7 +78,8 @@ static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, static enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, const void *data, - uint16_t datalen) + uint16_t datalen, + loff_t object_size) { struct nfs_fscache_inode_auxdata auxdata; struct nfs_inode *nfsi = cookie_netfs_data; @@ -239,7 +88,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OBSOLETE; memset(&auxdata, 0, sizeof(auxdata)); - auxdata.size = nfsi->vfs_inode.i_size; auxdata.mtime = nfsi->vfs_inode.i_mtime; auxdata.ctime = nfsi->vfs_inode.i_ctime; @@ -288,9 +136,6 @@ static void nfs_fh_put_context(void *cookie_netfs_data, void *context) const struct fscache_cookie_def nfs_fscache_inode_object_def = { .name = "NFS.fh", .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .get_key = nfs_fscache_inode_get_key, - .get_attr = nfs_fscache_inode_get_attr, - .get_aux = nfs_fscache_inode_get_aux, .check_aux = nfs_fscache_inode_check_aux, .get_context = nfs_fh_get_context, .put_context = nfs_fh_put_context, diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index d63bea8bbfbb..b55fc7920c3b 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -18,6 +18,7 @@ #include <linux/in6.h> #include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/iversion.h> #include "internal.h" #include "iostat.h" @@ -29,6 +30,21 @@ static struct rb_root nfs_fscache_keys = RB_ROOT; static DEFINE_SPINLOCK(nfs_fscache_keys_lock); /* + * Layout of the key for an NFS server cache object. 
+ */ +struct nfs_server_key { + struct { + uint16_t nfsversion; /* NFS protocol version */ + uint16_t family; /* address family */ + __be16 port; /* IP port */ + } hdr; + union { + struct in_addr ipv4_addr; /* IPv4 address */ + struct in6_addr ipv6_addr; /* IPv6 address */ + }; +} __packed; + +/* * Get the per-client index cookie for an NFS client if the appropriate mount * flag was set * - We always try and get an index cookie for the client, but get filehandle @@ -36,10 +52,41 @@ static DEFINE_SPINLOCK(nfs_fscache_keys_lock); */ void nfs_fscache_get_client_cookie(struct nfs_client *clp) { + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; + const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; + struct nfs_server_key key; + uint16_t len = sizeof(key.hdr); + + memset(&key, 0, sizeof(key)); + key.hdr.nfsversion = clp->rpc_ops->version; + key.hdr.family = clp->cl_addr.ss_family; + + switch (clp->cl_addr.ss_family) { + case AF_INET: + key.hdr.port = sin->sin_port; + key.ipv4_addr = sin->sin_addr; + len += sizeof(key.ipv4_addr); + break; + + case AF_INET6: + key.hdr.port = sin6->sin6_port; + key.ipv6_addr = sin6->sin6_addr; + len += sizeof(key.ipv6_addr); + break; + + default: + printk(KERN_WARNING "NFS: Unknown network family '%d'\n", + clp->cl_addr.ss_family); + clp->fscache = NULL; + return; + } + /* create a cache index for looking up filehandles */ clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, &nfs_fscache_server_index_def, - clp, true); + &key, len, + NULL, 0, + clp, 0, true); dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", clp, clp->fscache); } @@ -52,7 +99,7 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp) dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", clp, clp->fscache); - fscache_relinquish_cookie(clp->fscache, 0); + fscache_relinquish_cookie(clp->fscache, NULL, false); clp->fscache = NULL; } @@ -139,7 +186,9 @@ void 
nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int /* create a cache index for looking up filehandles */ nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, &nfs_fscache_super_index_def, - nfss, true); + key, sizeof(*key) + ulen, + NULL, 0, + nfss, 0, true); dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", nfss, nfss->fscache); return; @@ -163,7 +212,7 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", nfss, nfss->fscache); - fscache_relinquish_cookie(nfss->fscache, 0); + fscache_relinquish_cookie(nfss->fscache, NULL, false); nfss->fscache = NULL; if (nfss->fscache_key) { @@ -180,14 +229,25 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) */ void nfs_fscache_init_inode(struct inode *inode) { + struct nfs_fscache_inode_auxdata auxdata; struct nfs_inode *nfsi = NFS_I(inode); nfsi->fscache = NULL; if (!S_ISREG(inode->i_mode)) return; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); + nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, &nfs_fscache_inode_object_def, - nfsi, false); + nfsi->fh.data, nfsi->fh.size, + &auxdata, sizeof(auxdata), + nfsi, nfsi->vfs_inode.i_size, false); } /* @@ -195,12 +255,16 @@ void nfs_fscache_init_inode(struct inode *inode) */ void nfs_fscache_clear_inode(struct inode *inode) { + struct nfs_fscache_inode_auxdata auxdata; struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); - fscache_relinquish_cookie(cookie, false); + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + 
fscache_relinquish_cookie(cookie, &auxdata, false); nfsi->fscache = NULL; } @@ -232,20 +296,26 @@ static bool nfs_fscache_can_enable(void *data) */ void nfs_fscache_open_file(struct inode *inode, struct file *filp) { + struct nfs_fscache_inode_auxdata auxdata; struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); if (!fscache_cookie_valid(cookie)) return; + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.mtime = nfsi->vfs_inode.i_mtime; + auxdata.ctime = nfsi->vfs_inode.i_ctime; + if (inode_is_open_for_write(inode)) { dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); clear_bit(NFS_INO_FSCACHE, &nfsi->flags); - fscache_disable_cookie(cookie, true); + fscache_disable_cookie(cookie, &auxdata, true); fscache_uncache_all_inode_pages(cookie, inode); } else { dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi); - fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode); + fscache_enable_cookie(cookie, &auxdata, nfsi->vfs_inode.i_size, + nfs_fscache_can_enable, inode); if (fscache_cookie_enabled(cookie)) set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); } @@ -422,7 +492,8 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", nfs_i_fscache(inode), page, page->index, page->flags, sync); - ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL); + ret = fscache_write_page(nfs_i_fscache(inode), page, + inode->i_size, GFP_KERNEL); dfprintk(FSCACHE, "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", page, page->index, page->flags, ret); diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index d7fe3e799f2f..161ba2edb9d0 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -57,6 +57,21 @@ struct nfs_fscache_key { }; /* + * Definition of the auxiliary data attached to NFS inode storage objects + * within the cache. 
+ * + * The contents of this struct are recorded in the on-disk local cache in the + * auxiliary data attached to the data storage object backing an inode. This + * permits coherency to be managed when a new inode binds to an already extant + * cache object. + */ +struct nfs_fscache_inode_auxdata { + struct timespec mtime; + struct timespec ctime; + u64 change_attr; +}; + +/* * fscache-index.c */ extern struct fscache_netfs nfs_fscache_netfs; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d17a90c4fa37..bd15d0b57626 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -195,7 +195,10 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); + bool have_delegation = nfs_have_delegated_attributes(inode); + if (have_delegation) + flags &= ~(NFS_INO_INVALID_CHANGE|NFS_INO_REVAL_PAGECACHE); if (inode->i_mapping->nrpages == 0) flags &= ~NFS_INO_INVALID_DATA; nfsi->cache_validity |= flags; @@ -447,7 +450,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_mode = fattr->mode; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 && nfs_server_capable(inode, NFS_CAP_MODE)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. 
*/ @@ -493,37 +496,35 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; else if (nfs_server_capable(inode, NFS_CAP_ATIME)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; else if (nfs_server_capable(inode, NFS_CAP_MTIME)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; else if (nfs_server_capable(inode, NFS_CAP_CTIME)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode_set_iversion_raw(inode, fattr->change_attr); else - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_PAGECACHE); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_PAGECACHE); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE); if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); else if (nfs_server_capable(inode, NFS_CAP_NLINK)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (nfs_server_capable(inode, NFS_CAP_OWNER)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & 
NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -608,11 +609,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) goto out; } - /* - * Return any delegations if we're going to change ACLs - */ - if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) - NFS_PROTO(inode)->return_delegation(inode); error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); if (error == 0) error = nfs_refresh_inode(inode, fattr); @@ -645,6 +641,7 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) /* Optimisation */ if (offset == 0) NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); @@ -657,6 +654,7 @@ out: * nfs_setattr_update_inode - Update inode metadata after a setattr call. * @inode: pointer to struct inode * @attr: pointer to struct iattr + * @fattr: pointer to struct nfs_fattr * * Note: we do this in the *proc.c in order to ensure that * it works for things like exclusive creates too. 
@@ -669,6 +667,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, spin_lock(&inode->i_lock); NFS_I(inode)->attr_gencount = fattr->gencount; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME); if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; @@ -683,13 +683,12 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, | NFS_INO_INVALID_ACL); } if ((attr->ia_valid & ATTR_SIZE) != 0) { + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } if (fattr->valid) nfs_update_inode(inode, fattr); - else - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); @@ -1303,24 +1302,20 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); } -static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - unsigned long ret = 0; - if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) && inode_eq_iversion_raw(inode, fattr->pre_change_attr)) { inode_set_iversion_raw(inode, fattr->change_attr); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); - ret |= NFS_INO_INVALID_ATTR; } /* If we have atomic WCC data, we may update some attributes */ if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - ret |= NFS_INO_INVALID_ATTR; } if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) @@ -1329,17 +1324,13 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr 
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); - ret |= NFS_INO_INVALID_ATTR; } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && !nfs_have_writebacks(inode)) { i_size_write(inode, nfs_size_to_loff_t(fattr->size)); - ret |= NFS_INO_INVALID_ATTR; } - - return ret; } /** @@ -1369,33 +1360,41 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; + invalid |= NFS_INO_INVALID_CHANGE + | NFS_INO_REVAL_PAGECACHE; if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) - invalid |= NFS_INO_INVALID_ATTR; + invalid |= NFS_INO_INVALID_MTIME; if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime)) - invalid |= NFS_INO_INVALID_ATTR; + invalid |= NFS_INO_INVALID_CTIME; if (fattr->valid & NFS_ATTR_FATTR_SIZE) { cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); if (cur_size != new_isize) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + invalid |= NFS_INO_INVALID_SIZE + | NFS_INO_REVAL_PAGECACHE; } } /* Have any file permissions changed? 
*/ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; /* Has the link count changed? */ if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) - invalid |= NFS_INO_INVALID_ATTR; + invalid |= NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; @@ -1597,10 +1596,9 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) } EXPORT_SYMBOL_GPL(nfs_refresh_inode); -static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +static int nfs_post_op_update_inode_locked(struct inode *inode, + struct nfs_fattr *fattr, unsigned int invalid) { - unsigned long invalid = NFS_INO_INVALID_ATTR; - if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); @@ -1629,7 +1627,9 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) spin_lock(&inode->i_lock); nfs_fattr_set_barrier(fattr); - status = nfs_post_op_update_inode_locked(inode, fattr); + status = nfs_post_op_update_inode_locked(inode, fattr, + NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME); spin_unlock(&inode->i_lock); return status; @@ -1681,7 +1681,10 @@ int 
nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa fattr->valid |= NFS_ATTR_FATTR_PRESIZE; } out_noforce: - status = nfs_post_op_update_inode_locked(inode, fattr); + status = nfs_post_op_update_inode_locked(inode, fattr, + NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_MTIME); return status; } @@ -1789,7 +1792,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ - invalid |= nfs_wcc_update_inode(inode, fattr); + nfs_wcc_update_inode(inode, fattr); if (pnfs_layoutcommit_outstanding(inode)) { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR; @@ -1803,17 +1806,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_sb->s_id, inode->i_ino); /* Could it be a race with writeback? */ if (!have_writers) { - invalid |= NFS_INO_INVALID_ATTR + invalid |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + /* Force revalidate of all attributes */ + save_cache_validity |= NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_MTIME + | NFS_INO_INVALID_SIZE + | NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); } inode_set_iversion_raw(inode, fattr->change_attr); } } else { - nfsi->cache_validity |= save_cache_validity; + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_CHANGE + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1821,7 +1832,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } else if (server->caps & NFS_CAP_MTIME) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR + (NFS_INO_INVALID_MTIME | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1830,7 +1841,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) 
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } else if (server->caps & NFS_CAP_CTIME) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR + (NFS_INO_INVALID_CTIME | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1845,7 +1856,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (!nfs_have_writebacks(inode) || new_isize > cur_isize) { i_size_write(inode, new_isize); if (!have_writers) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid |= NFS_INO_INVALID_DATA; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -1856,7 +1867,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } } else { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR + (NFS_INO_INVALID_SIZE | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); cache_revalidated = false; @@ -1877,55 +1888,61 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) umode_t newmode = inode->i_mode & S_IFMT; newmode |= fattr->mode & S_IALLUGO; inode->i_mode = newmode; - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; } } else if (server->caps & NFS_CAP_MODE) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS + (NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; inode->i_uid = fattr->uid; } } else if (server->caps & NFS_CAP_OWNER) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS + (NFS_INO_INVALID_ACCESS | 
NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; inode->i_gid = fattr->gid; } } else if (server->caps & NFS_CAP_OWNER_GROUP) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS + (NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { - invalid |= NFS_INO_INVALID_ATTR; + invalid |= NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; set_nlink(inode, fattr->nlink); } } else if (server->caps & NFS_CAP_NLINK) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR + (NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1942,6 +1959,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { + invalid &= ~NFS_INO_INVALID_ATTR; nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; @@ -1962,10 +1980,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->attr_gencount = fattr->gencount; } - /* Don't declare attrcache up to date if there were no attrs! 
*/ - if (cache_revalidated) - invalid &= ~NFS_INO_INVALID_ATTR; - /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 7327930ad970..eadf1ab31d16 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -138,8 +138,11 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, msg.rpc_cred = nfs_file_cred(sattr->ia_file); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - if (status == 0) + if (status == 0) { + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); nfs_setattr_update_inode(inode, sattr, fattr); + } dprintk("NFS reply setattr: %d\n", status); return status; } @@ -383,11 +386,11 @@ out: } static int -nfs3_proc_remove(struct inode *dir, const struct qstr *name) +nfs3_proc_remove(struct inode *dir, struct dentry *dentry) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), - .name = *name, + .name = dentry->d_name, }; struct nfs_removeres res; struct rpc_message msg = { @@ -397,7 +400,7 @@ nfs3_proc_remove(struct inode *dir, const struct qstr *name) }; int status = -ENOMEM; - dprintk("NFS call remove %s\n", name->name); + dprintk("NFS call remove %pd2\n", dentry); res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) goto out; @@ -411,7 +414,7 @@ out: } static void -nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; } @@ -433,7 +436,9 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) } static void -nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +nfs3_proc_rename_setup(struct rpc_message *msg, + struct dentry *old_dentry, + struct dentry *new_dentry) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; } @@ -908,12 +913,6 @@ static int nfs3_have_delegation(struct inode 
*inode, fmode_t flags) return 0; } -static int nfs3_return_delegation(struct inode *inode) -{ - nfs_wb_all(inode); - return 0; -} - static const struct inode_operations nfs3_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, @@ -990,7 +989,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .clear_acl_cache = forget_all_cached_acls, .close_context = nfs_close_context, .have_delegation = nfs3_have_delegation, - .return_delegation = nfs3_return_delegation, .alloc_client = nfs_alloc_client, .init_client = nfs_init_client, .free_client = nfs_free_client, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 6cd33bd5da87..09ee36dd8426 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1997,6 +1997,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_entry old = *entry; __be32 *p; int error; + u64 new_cookie; p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) @@ -2019,8 +2020,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (unlikely(error)) return error; - entry->prev_cookie = entry->cookie; - error = decode_cookie3(xdr, &entry->cookie); + error = decode_cookie3(xdr, &new_cookie); if (unlikely(error)) return error; @@ -2054,6 +2054,9 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, zero_nfs_fh3(entry->fh); } + entry->prev_cookie = entry->cookie; + entry->cookie = new_cookie; + return 0; out_overflow: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 47f3c273245e..b71757e85066 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1045,7 +1045,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo, struct nfs_inode *nfsi = NFS_I(dir); spin_lock(&dir->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + nfsi->cache_validity |= NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_MTIME + | NFS_INO_INVALID_DATA; if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(dir)) { 
nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; nfsi->attrtimeo_timestamp = jiffies; @@ -1669,6 +1671,7 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo { struct nfs_delegation *delegation; + fmode &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation == NULL || (delegation->type & fmode) == fmode) { @@ -1751,12 +1754,16 @@ nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state) } if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) nfs_inode_set_delegation(state->inode, - data->owner->so_cred, - &data->o_res); + data->owner->so_cred, + data->o_res.delegation_type, + &data->o_res.delegation, + data->o_res.pagemod_limit); else nfs_inode_reclaim_delegation(state->inode, - data->owner->so_cred, - &data->o_res); + data->owner->so_cred, + data->o_res.delegation_type, + &data->o_res.delegation, + data->o_res.pagemod_limit); } /* @@ -2743,27 +2750,40 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st * fields corresponding to attributes that were used to store the verifier. * Make sure we clobber those fields in the later setattr call */ -static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, +static unsigned nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr, struct nfs4_label **label) { - const u32 *attrset = opendata->o_res.attrset; + const __u32 *bitmask = opendata->o_arg.server->exclcreat_bitmask; + __u32 attrset[3]; + unsigned ret; + unsigned i; - if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) && - !(sattr->ia_valid & ATTR_ATIME_SET)) - sattr->ia_valid |= ATTR_ATIME; + for (i = 0; i < ARRAY_SIZE(attrset); i++) { + attrset[i] = opendata->o_res.attrset[i]; + if (opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE4_1) + attrset[i] &= ~bitmask[i]; + } + + ret = (opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE) ? 
+ sattr->ia_valid : 0; - if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) && - !(sattr->ia_valid & ATTR_MTIME_SET)) - sattr->ia_valid |= ATTR_MTIME; + if ((attrset[1] & (FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET))) { + if (sattr->ia_valid & ATTR_ATIME_SET) + ret |= ATTR_ATIME_SET; + else + ret |= ATTR_ATIME; + } - /* Except MODE, it seems harmless of setting twice. */ - if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE && - (attrset[1] & FATTR4_WORD1_MODE || - attrset[2] & FATTR4_WORD2_MODE_UMASK)) - sattr->ia_valid &= ~ATTR_MODE; + if ((attrset[1] & (FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET))) { + if (sattr->ia_valid & ATTR_MTIME_SET) + ret |= ATTR_MTIME_SET; + else + ret |= ATTR_MTIME; + } - if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) + if (!(attrset[2] & FATTR4_WORD2_SECURITY_LABEL)) *label = NULL; + return ret; } static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, @@ -2892,12 +2912,15 @@ static int _nfs4_do_open(struct inode *dir, if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) && (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { - nfs4_exclusive_attrset(opendata, sattr, &label); + unsigned attrs = nfs4_exclusive_attrset(opendata, sattr, &label); /* * send create attributes which was not set by open * with an extra setattr. 
*/ - if (sattr->ia_valid & NFS4_VALID_ATTRS) { + if (attrs || label) { + unsigned ia_old = sattr->ia_valid; + + sattr->ia_valid = attrs; nfs_fattr_init(opendata->o_res.f_attr); status = nfs4_do_setattr(state->inode, cred, opendata->o_res.f_attr, sattr, @@ -2907,6 +2930,7 @@ static int _nfs4_do_open(struct inode *dir, opendata->o_res.f_attr); nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); } + sattr->ia_valid = ia_old; } } if (opened && opendata->file_created) @@ -3874,6 +3898,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, if (IS_ERR(label)) return PTR_ERR(label); + /* Return any delegations if we're going to change ACLs */ + if ((sattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) + nfs4_inode_make_writeable(inode); + status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL, label); if (status == 0) { nfs_setattr_update_inode(inode, sattr, fattr); @@ -4048,7 +4076,6 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry struct nfs_server *server = NFS_SERVER(inode); struct nfs4_accessargs args = { .fh = NFS_FH(inode), - .bitmask = server->cache_consistency_bitmask, .access = entry->mask, }; struct nfs4_accessres res = { @@ -4062,14 +4089,18 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry }; int status = 0; - res.fattr = nfs_alloc_fattr(); - if (res.fattr == NULL) - return -ENOMEM; + if (!nfs_have_delegated_attributes(inode)) { + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return -ENOMEM; + args.bitmask = server->cache_consistency_bitmask; + } status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (!status) { nfs_access_set_mask(entry, res.access); - nfs_refresh_inode(inode, res.fattr); + if (res.fattr) + nfs_refresh_inode(inode, res.fattr); } nfs_free_fattr(res.fattr); return status; @@ -4199,10 +4230,32 @@ static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name) return status; } -static 
int nfs4_proc_remove(struct inode *dir, const struct qstr *name) +static int nfs4_proc_remove(struct inode *dir, struct dentry *dentry) +{ + struct nfs4_exception exception = { }; + struct inode *inode = d_inode(dentry); + int err; + + if (inode) { + if (inode->i_nlink == 1) + nfs4_inode_return_delegation(inode); + else + nfs4_inode_make_writeable(inode); + } + do { + err = _nfs4_proc_remove(dir, &dentry->d_name); + trace_nfs4_remove(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, + &exception); + } while (exception.retry); + return err; +} + +static int nfs4_proc_rmdir(struct inode *dir, const struct qstr *name) { struct nfs4_exception exception = { }; int err; + do { err = _nfs4_proc_remove(dir, name); trace_nfs4_remove(dir, name, err); @@ -4212,17 +4265,20 @@ static int nfs4_proc_remove(struct inode *dir, const struct qstr *name) return err; } -static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry) { - struct nfs_server *server = NFS_SERVER(dir); struct nfs_removeargs *args = msg->rpc_argp; struct nfs_removeres *res = msg->rpc_resp; + struct inode *inode = d_inode(dentry); - res->server = server; + res->server = NFS_SB(dentry->d_sb); msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; nfs4_init_sequence(&args->seq_args, &res->seq_res, 1); nfs_fattr_init(res->dir_attr); + + if (inode) + nfs4_inode_return_delegation(inode); } static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) @@ -4248,14 +4304,21 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) return 1; } -static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +static void nfs4_proc_rename_setup(struct rpc_message *msg, + struct dentry *old_dentry, + struct dentry *new_dentry) { - struct nfs_server *server = NFS_SERVER(dir); struct nfs_renameargs *arg = msg->rpc_argp; struct 
nfs_renameres *res = msg->rpc_resp; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + if (old_inode) + nfs4_inode_make_writeable(old_inode); + if (new_inode) + nfs4_inode_return_delegation(new_inode); msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; - res->server = server; + res->server = NFS_SB(old_dentry->d_sb); nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1); } @@ -4317,6 +4380,8 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct } arg.bitmask = nfs4_bitmask(server, res.label); + nfs4_inode_make_writeable(inode); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(dir, &res.cinfo, res.fattr->time_start); @@ -5310,7 +5375,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl i = buf_to_pages_noslab(buf, buflen, arg.acl_pages); if (i < 0) return i; - nfs4_inode_return_delegation(inode); + nfs4_inode_make_writeable(inode); ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); /* @@ -5325,7 +5390,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl * so mark the attribute cache invalid. 
*/ spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME; spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); @@ -6621,22 +6687,24 @@ static int nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key) { int ret; - struct cb_notify_lock_args *cbnl = key; struct nfs4_lock_waiter *waiter = wait->private; - struct nfs_lowner *lowner = &cbnl->cbnl_owner, - *wowner = waiter->owner; - /* Only wake if the callback was for the same owner */ - if (lowner->clientid != wowner->clientid || - lowner->id != wowner->id || - lowner->s_dev != wowner->s_dev) - return 0; + /* NULL key means to wake up everyone */ + if (key) { + struct cb_notify_lock_args *cbnl = key; + struct nfs_lowner *lowner = &cbnl->cbnl_owner, + *wowner = waiter->owner; - /* Make sure it's for the right inode */ - if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) - return 0; + /* Only wake if the callback was for the same owner. */ + if (lowner->id != wowner->id || lowner->s_dev != wowner->s_dev) + return 0; - waiter->notified = true; + /* Make sure it's for the right inode */ + if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) + return 0; + + waiter->notified = true; + } /* override "private" so we can use default_wake_function */ wait->private = waiter->task; @@ -6673,6 +6741,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) add_wait_queue(q, &wait); while(!signalled()) { + waiter.notified = false; status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) break; @@ -8414,6 +8483,8 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf { switch(task->tk_status) { case 0: + wake_up_all(&clp->cl_lock_waitq); + /* Fallthrough */ case -NFS4ERR_COMPLETE_ALREADY: case -NFS4ERR_WRONG_CRED: /* What to do here? 
*/ break; @@ -9593,7 +9664,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .link = nfs4_proc_link, .symlink = nfs4_proc_symlink, .mkdir = nfs4_proc_mkdir, - .rmdir = nfs4_proc_remove, + .rmdir = nfs4_proc_rmdir, .readdir = nfs4_proc_readdir, .mknod = nfs4_proc_mknod, .statfs = nfs4_proc_statfs, @@ -9614,7 +9685,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .close_context = nfs4_close_context, .open_context = nfs4_atomic_open, .have_delegation = nfs4_have_delegation, - .return_delegation = nfs4_inode_return_delegation, .alloc_client = nfs4_alloc_client, .init_client = nfs4_init_client, .free_client = nfs4_free_client, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 91a4d4eeb235..c10a422efe6f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -428,7 +428,6 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp; - int err; while (*p != NULL) { parent = *p; @@ -445,9 +444,6 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) return sp; } } - err = ida_get_new(&server->openowner_id, &new->so_seqid.owner_id); - if (err) - return ERR_PTR(err); rb_link_node(&new->so_server_node, parent, p); rb_insert_color(&new->so_server_node, &server->state_owners); return new; @@ -460,7 +456,6 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp) if (!RB_EMPTY_NODE(&sp->so_server_node)) rb_erase(&sp->so_server_node, &server->state_owners); - ida_remove(&server->openowner_id, sp->so_seqid.owner_id); } static void @@ -495,6 +490,12 @@ nfs4_alloc_state_owner(struct nfs_server *server, sp = kzalloc(sizeof(*sp), gfp_flags); if (!sp) return NULL; + sp->so_seqid.owner_id = ida_simple_get(&server->openowner_id, 0, 0, + gfp_flags); + if (sp->so_seqid.owner_id < 0) { + kfree(sp); + return NULL; + } sp->so_server = server; sp->so_cred = get_rpccred(cred); spin_lock_init(&sp->so_lock); @@ -526,6 +527,7 @@ static void nfs4_free_state_owner(struct 
nfs4_state_owner *sp) { nfs4_destroy_seqid_counter(&sp->so_seqid); put_rpccred(sp->so_cred); + ida_simple_remove(&sp->so_server->openowner_id, sp->so_seqid.owner_id); kfree(sp); } @@ -576,13 +578,9 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, new = nfs4_alloc_state_owner(server, cred, gfp_flags); if (new == NULL) goto out; - do { - if (ida_pre_get(&server->openowner_id, gfp_flags) == 0) - break; - spin_lock(&clp->cl_lock); - sp = nfs4_insert_state_owner_locked(new); - spin_unlock(&clp->cl_lock); - } while (sp == ERR_PTR(-EAGAIN)); + spin_lock(&clp->cl_lock); + sp = nfs4_insert_state_owner_locked(new); + spin_unlock(&clp->cl_lock); if (sp != new) nfs4_free_state_owner(new); out: diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 65c9c4175145..9b7392032321 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,7 +52,6 @@ #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> -#include <linux/fs_struct.h> #include "nfs4_fs.h" #include "internal.h" @@ -99,6 +98,7 @@ static int nfs4_stat_to_errno(int); ((3+NFS4_FHSIZE) >> 2)) #define nfs4_fattr_bitmap_maxsz 4 #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) +#define nfstime4_maxsz (3) #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) #define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) @@ -113,7 +113,8 @@ static int nfs4_stat_to_errno(int); #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) /* This is based on getfattr, which uses the most attributes: */ #define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ - 3 + 3 + 3 + nfs4_owner_maxsz + \ + 3*nfstime4_maxsz + \ + nfs4_owner_maxsz + \ nfs4_group_maxsz + nfs4_label_maxsz + \ decode_mdsthreshold_maxsz)) #define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ @@ -124,7 +125,8 @@ static int nfs4_stat_to_errno(int); nfs4_owner_maxsz + \ nfs4_group_maxsz + \ nfs4_label_maxsz + 
\ - 4 + 4) + 1 + nfstime4_maxsz + \ + 1 + nfstime4_maxsz) #define encode_savefh_maxsz (op_encode_hdr_maxsz) #define decode_savefh_maxsz (op_decode_hdr_maxsz) #define encode_restorefh_maxsz (op_encode_hdr_maxsz) @@ -958,6 +960,35 @@ static void encode_uint64(struct xdr_stream *xdr, u64 n) WARN_ON_ONCE(xdr_stream_encode_u64(xdr, n) < 0); } +static ssize_t xdr_encode_bitmap4(struct xdr_stream *xdr, + const __u32 *bitmap, size_t len) +{ + ssize_t ret; + + /* Trim empty words */ + while (len > 0 && bitmap[len-1] == 0) + len--; + ret = xdr_stream_encode_uint32_array(xdr, bitmap, len); + if (WARN_ON_ONCE(ret < 0)) + return ret; + return len; +} + +static size_t mask_bitmap4(const __u32 *bitmap, const __u32 *mask, + __u32 *res, size_t len) +{ + size_t i; + __u32 tmp; + + while (len > 0 && (bitmap[len-1] == 0 || mask[len-1] == 0)) + len--; + for (i = len; i-- > 0;) { + tmp = bitmap[i] & mask[i]; + res[i] = tmp; + } + return len; +} + static void encode_nfs4_seqid(struct xdr_stream *xdr, const struct nfs_seqid *seqid) { @@ -1012,6 +1043,14 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE); } +static __be32 * +xdr_encode_nfstime4(__be32 *p, const struct timespec *t) +{ + p = xdr_encode_hyper(p, (__s64)t->tv_sec); + *p++ = cpu_to_be32(t->tv_nsec); + return p; +} + static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, const umode_t *umask, @@ -1023,9 +1062,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, int owner_namelen = 0; int owner_grouplen = 0; __be32 *p; - unsigned i; uint32_t len = 0; - uint32_t bmval_len; uint32_t bmval[3] = { 0 }; /* @@ -1073,7 +1110,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (attrmask[1] & FATTR4_WORD1_TIME_ACCESS_SET) { if (iap->ia_valid & ATTR_ATIME_SET) { bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; - len += 16; + len += 4 + (nfstime4_maxsz 
<< 2); } else if (iap->ia_valid & ATTR_ATIME) { bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; len += 4; @@ -1082,7 +1119,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (attrmask[1] & FATTR4_WORD1_TIME_MODIFY_SET) { if (iap->ia_valid & ATTR_MTIME_SET) { bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; - len += 16; + len += 4 + (nfstime4_maxsz << 2); } else if (iap->ia_valid & ATTR_MTIME) { bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; len += 4; @@ -1094,19 +1131,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; } - if (bmval[2] != 0) - bmval_len = 3; - else if (bmval[1] != 0) - bmval_len = 2; - else - bmval_len = 1; - - p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len); - - *p++ = cpu_to_be32(bmval_len); - for (i = 0; i < bmval_len; i++) - *p++ = cpu_to_be32(bmval[i]); - *p++ = cpu_to_be32(len); + xdr_encode_bitmap4(xdr, bmval, ARRAY_SIZE(bmval)); + xdr_stream_encode_opaque_inline(xdr, (void **)&p, len); if (bmval[0] & FATTR4_WORD0_SIZE) p = xdr_encode_hyper(p, iap->ia_size); @@ -1119,16 +1145,14 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { if (iap->ia_valid & ATTR_ATIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); - *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); + p = xdr_encode_nfstime4(p, &iap->ia_atime); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { if (iap->ia_valid & ATTR_MTIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec); - *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); + p = xdr_encode_nfstime4(p, &iap->ia_mtime); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } @@ -1200,85 +1224,45 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * create->server, create->server->attr_bitmask); } 
-static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) -{ - __be32 *p; - - encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); - p = reserve_space(xdr, 8); - *p++ = cpu_to_be32(1); - *p = cpu_to_be32(bitmap); -} - -static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) -{ - __be32 *p; - - encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); - p = reserve_space(xdr, 12); - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(bm0); - *p = cpu_to_be32(bm1); -} - -static void -encode_getattr_three(struct xdr_stream *xdr, - uint32_t bm0, uint32_t bm1, uint32_t bm2, - struct compound_hdr *hdr) +static void encode_getattr(struct xdr_stream *xdr, + const __u32 *bitmap, const __u32 *mask, size_t len, + struct compound_hdr *hdr) { - __be32 *p; + __u32 masked_bitmap[nfs4_fattr_bitmap_maxsz]; encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); - if (bm2) { - p = reserve_space(xdr, 16); - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(bm0); - *p++ = cpu_to_be32(bm1); - *p = cpu_to_be32(bm2); - } else if (bm1) { - p = reserve_space(xdr, 12); - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(bm0); - *p = cpu_to_be32(bm1); - } else { - p = reserve_space(xdr, 8); - *p++ = cpu_to_be32(1); - *p = cpu_to_be32(bm0); + if (mask) { + if (WARN_ON_ONCE(len > ARRAY_SIZE(masked_bitmap))) + len = ARRAY_SIZE(masked_bitmap); + len = mask_bitmap4(bitmap, mask, masked_bitmap, len); + bitmap = masked_bitmap; } + xdr_encode_bitmap4(xdr, bitmap, len); } static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0], - bitmask[1] & nfs4_fattr_bitmap[1], - bitmask[2] & nfs4_fattr_bitmap[2], - hdr); + encode_getattr(xdr, nfs4_fattr_bitmap, bitmask, + ARRAY_SIZE(nfs4_fattr_bitmap), hdr); } static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, const u32 *open_bitmap, struct 
compound_hdr *hdr) { - encode_getattr_three(xdr, - bitmask[0] & open_bitmap[0], - bitmask[1] & open_bitmap[1], - bitmask[2] & open_bitmap[2], - hdr); + encode_getattr(xdr, open_bitmap, bitmask, 3, hdr); } static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_three(xdr, - bitmask[0] & nfs4_fsinfo_bitmap[0], - bitmask[1] & nfs4_fsinfo_bitmap[1], - bitmask[2] & nfs4_fsinfo_bitmap[2], - hdr); + encode_getattr(xdr, nfs4_fsinfo_bitmap, bitmask, + ARRAY_SIZE(nfs4_fsinfo_bitmap), hdr); } static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0], - bitmask[1] & nfs4_fs_locations_bitmap[1], hdr); + encode_getattr(xdr, nfs4_fs_locations_bitmap, bitmask, + ARRAY_SIZE(nfs4_fs_locations_bitmap), hdr); } static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) @@ -2117,7 +2101,8 @@ static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); encode_access(xdr, args->access, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2559,13 +2544,17 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; + const __u32 nfs4_acl_bitmap[1] = { + [0] = FATTR4_WORD0_ACL, + }; uint32_t replen; encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); replen = hdr.replen + op_decode_hdr_maxsz; - encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); + encode_getattr(xdr, nfs4_acl_bitmap, NULL, + ARRAY_SIZE(nfs4_acl_bitmap), &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, args->acl_pages, 0, args->acl_len); @@ -2644,8 +2633,8 @@ static void 
nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], - &hdr); + encode_getattr(xdr, nfs4_pathconf_bitmap, args->bitmask, + ARRAY_SIZE(nfs4_pathconf_bitmap), &hdr); encode_nops(&hdr); } @@ -2663,8 +2652,8 @@ static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], - args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); + encode_getattr(xdr, nfs4_statfs_bitmap, args->bitmask, + ARRAY_SIZE(nfs4_statfs_bitmap), &hdr); encode_nops(&hdr); } @@ -2684,7 +2673,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fhandle, &hdr); - encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr); + encode_getattr(xdr, bitmask, NULL, 3, &hdr); encode_nops(&hdr); } @@ -3218,34 +3207,27 @@ static int decode_ace(struct xdr_stream *xdr, void *ace) return -EIO; } -static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +static ssize_t +decode_bitmap4(struct xdr_stream *xdr, uint32_t *bitmap, size_t sz) { - uint32_t bmlen; - __be32 *p; - - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - bmlen = be32_to_cpup(p); + ssize_t ret; - bitmap[0] = bitmap[1] = bitmap[2] = 0; - p = xdr_inline_decode(xdr, (bmlen << 2)); - if (unlikely(!p)) - goto out_overflow; - if (bmlen > 0) { - bitmap[0] = be32_to_cpup(p++); - if (bmlen > 1) { - bitmap[1] = be32_to_cpup(p++); - if (bmlen > 2) - bitmap[2] = be32_to_cpup(p); - } - } - return 0; -out_overflow: + ret = xdr_stream_decode_uint32_array(xdr, bitmap, sz); + if (likely(ret >= 0)) + return ret; + 
if (ret == -EMSGSIZE) + return sz; print_overflow_msg(__func__, xdr); return -EIO; } +static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +{ + ssize_t ret; + ret = decode_bitmap4(xdr, bitmap, 3); + return ret < 0 ? ret : 0; +} + static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigned int *savep) { __be32 *p; @@ -3981,7 +3963,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, bitmap[1] &= ~FATTR4_WORD1_OWNER; if (owner_name != NULL) { - len = decode_nfs4_string(xdr, owner_name, GFP_NOWAIT); + len = decode_nfs4_string(xdr, owner_name, GFP_NOIO); if (len <= 0) goto out; dprintk("%s: name=%s\n", __func__, owner_name->data); @@ -4016,7 +3998,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; if (group_name != NULL) { - len = decode_nfs4_string(xdr, group_name, GFP_NOWAIT); + len = decode_nfs4_string(xdr, group_name, GFP_NOIO); if (len <= 0) goto out; dprintk("%s: name=%s\n", __func__, group_name->data); @@ -4156,19 +4138,25 @@ out_overflow: return -EIO; } +static __be32 * +xdr_decode_nfstime4(__be32 *p, struct timespec *t) +{ + __u64 sec; + + p = xdr_decode_hyper(p, &sec); + t-> tv_sec = (time_t)sec; + t->tv_nsec = be32_to_cpup(p++); + return p; +} + static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) { __be32 *p; - uint64_t sec; - uint32_t nsec; - p = xdr_inline_decode(xdr, 12); + p = xdr_inline_decode(xdr, nfstime4_maxsz << 2); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, &sec); - nsec = be32_to_cpup(p); - time->tv_sec = (time_t)sec; - time->tv_nsec = (long)nsec; + xdr_decode_nfstime4(p, time); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -5471,21 +5459,13 @@ decode_savefh(struct xdr_stream *xdr) static int decode_setattr(struct xdr_stream *xdr) { - __be32 *p; - uint32_t bmlen; int status; status = decode_op_hdr(xdr, OP_SETATTR); if (status) return status; - p = 
xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - bmlen = be32_to_cpup(p); - p = xdr_inline_decode(xdr, bmlen << 2); - if (likely(p)) + if (decode_bitmap4(xdr, NULL, 0) >= 0) return 0; -out_overflow: print_overflow_msg(__func__, xdr); return -EIO; } @@ -6256,7 +6236,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_access(xdr, &res->supported, &res->access); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server); + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -7536,6 +7517,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, unsigned int savep; uint32_t bitmap[3] = {0}; uint32_t len; + uint64_t new_cookie; __be32 *p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; @@ -7552,8 +7534,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) goto out_overflow; - entry->prev_cookie = entry->cookie; - p = xdr_decode_hyper(p, &entry->cookie); + p = xdr_decode_hyper(p, &new_cookie); entry->len = be32_to_cpup(p); p = xdr_inline_decode(xdr, entry->len); @@ -7587,6 +7568,9 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + entry->prev_cookie = entry->cookie; + entry->cookie = new_cookie; + return 0; out_overflow: diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index f7fd9192d4bc..4e93d6308733 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -300,11 +300,11 @@ out: } static int -nfs_proc_remove(struct inode *dir, const struct qstr *name) +nfs_proc_remove(struct inode *dir, struct dentry *dentry) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), - .name = *name, + .name = dentry->d_name, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_REMOVE], @@ -312,7 +312,7 @@ nfs_proc_remove(struct inode 
*dir, const struct qstr *name) }; int status; - dprintk("NFS call remove %s\n", name->name); + dprintk("NFS call remove %pd2\n",dentry); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); @@ -321,7 +321,7 @@ nfs_proc_remove(struct inode *dir, const struct qstr *name) } static void -nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +nfs_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry) { msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE]; } @@ -338,7 +338,9 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) } static void -nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +nfs_proc_rename_setup(struct rpc_message *msg, + struct dentry *old_dentry, + struct dentry *new_dentry) { msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME]; } @@ -671,12 +673,6 @@ static int nfs_have_delegation(struct inode *inode, fmode_t flags) return 0; } -static int nfs_return_delegation(struct inode *inode) -{ - nfs_wb_all(inode); - return 0; -} - static const struct inode_operations nfs_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, @@ -741,7 +737,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .lock_check_bounds = nfs_lock_check_bounds, .close_context = nfs_close_context, .have_delegation = nfs_have_delegation, - .return_delegation = nfs_return_delegation, .alloc_client = nfs_alloc_client, .init_client = nfs_init_client, .free_client = nfs_free_client, diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 630b4a3c1a93..bf54fc9ae135 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -105,7 +105,7 @@ static void nfs_do_call_unlink(struct nfs_unlinkdata *data) data->args.fh = NFS_FH(dir); nfs_fattr_init(data->res.dir_attr); - NFS_PROTO(dir)->unlink_setup(&msg, dir); + NFS_PROTO(dir)->unlink_setup(&msg, data->dentry); task_setup_data.rpc_client = NFS_CLIENT(dir); task = rpc_run_task(&task_setup_data); @@ -386,7 +386,7 @@ nfs_async_rename(struct inode *old_dir, struct 
inode *new_dir, nfs_sb_active(old_dir->i_sb); - NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir); + NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dentry, new_dentry); return rpc_run_task(&task_setup_data); } @@ -463,9 +463,6 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) fileid = NFS_FILEID(d_inode(dentry)); - /* Return delegation in anticipation of the rename */ - NFS_PROTO(d_inode(dentry))->return_delegation(d_inode(dentry)); - sdentry = NULL; do { int slen; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6579f3b367bd..0193053bc139 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -231,6 +231,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c if (i_size >= end) goto out; i_size_write(inode, end); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); out: spin_unlock(&inode->i_lock); @@ -1562,8 +1563,11 @@ static int nfs_writeback_done(struct rpc_task *task, } /* Deal with the suid/sgid bit corner case */ - if (nfs_should_remove_suid(inode)) - nfs_mark_for_revalidate(inode); + if (nfs_should_remove_suid(inode)) { + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER; + spin_unlock(&inode->i_lock); + } return 0; } diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 1d0ce3c57d93..6259a4b8579f 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -192,6 +192,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp) struct nfsd3_writeres *resp = rqstp->rq_resp; __be32 nfserr; unsigned long cnt = argp->len; + unsigned int nvecs; dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", SVCFH_fmt(&argp->fh), @@ -201,9 +202,12 @@ nfsd3_proc_write(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; + nvecs = svc_fill_write_vector(rqstp, &argp->first, cnt); + if (!nvecs) + RETURN_STATUS(nfserr_io); nfserr = nfsd_write(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, argp->vlen, - &cnt, resp->committed); + 
rqstp->rq_vec, nvecs, &cnt, + resp->committed); resp->count = cnt; RETURN_STATUS(nfserr); } @@ -279,6 +283,16 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp) struct nfsd3_diropres *resp = rqstp->rq_resp; __be32 nfserr; + if (argp->tlen == 0) + RETURN_STATUS(nfserr_inval); + if (argp->tlen > NFS3_MAXPATHLEN) + RETURN_STATUS(nfserr_nametoolong); + + argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, + argp->tlen); + if (IS_ERR(argp->tname)) + RETURN_STATUS(nfserrno(PTR_ERR(argp->tname))); + dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n", SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 1a70581e1cb2..3192b544a441 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -391,7 +391,7 @@ int nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_writeargs *args = rqstp->rq_argp; - unsigned int len, v, hdr, dlen; + unsigned int len, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); struct kvec *head = rqstp->rq_arg.head; struct kvec *tail = rqstp->rq_arg.tail; @@ -433,17 +433,9 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) args->count = max_blocksize; len = args->len = max_blocksize; } - rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = head->iov_len - hdr; - v = 0; - while (len > rqstp->rq_vec[v].iov_len) { - len -= rqstp->rq_vec[v].iov_len; - v++; - rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); - rqstp->rq_vec[v].iov_len = PAGE_SIZE; - } - rqstp->rq_vec[v].iov_len = len; - args->vlen = v + 1; + + args->first.iov_base = (void *)p; + args->first.iov_len = head->iov_len - hdr; return 1; } @@ -489,51 +481,24 @@ int nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_symlinkargs *args = rqstp->rq_argp; - unsigned int len, avail; - char *old, *new; - struct kvec *vec; + char *base = (char *)p; + size_t dlen; if (!(p = decode_fh(p, &args->ffh)) || - !(p = decode_filename(p, &args->fname, &args->flen)) - ) + !(p = 
decode_filename(p, &args->fname, &args->flen))) return 0; p = decode_sattr3(p, &args->attrs); - /* now decode the pathname, which might be larger than the first page. - * As we have to check for nul's anyway, we copy it into a new page - * This page appears in the rq_res.pages list, but as pages_len is always - * 0, it won't get in the way - */ - len = ntohl(*p++); - if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE) - return 0; - args->tname = new = page_address(*(rqstp->rq_next_page++)); - args->tlen = len; - /* first copy and check from the first page */ - old = (char*)p; - vec = &rqstp->rq_arg.head[0]; - if ((void *)old > vec->iov_base + vec->iov_len) - return 0; - avail = vec->iov_len - (old - (char*)vec->iov_base); - while (len && avail && *old) { - *new++ = *old++; - len--; - avail--; - } - /* now copy next page if there is one */ - if (len && !avail && rqstp->rq_arg.page_len) { - avail = min_t(unsigned int, rqstp->rq_arg.page_len, PAGE_SIZE); - old = page_address(rqstp->rq_arg.pages[0]); - } - while (len && avail && *old) { - *new++ = *old++; - len--; - avail--; - } - *new = '\0'; - if (len) - return 0; + args->tlen = ntohl(*p++); + + args->first.iov_base = p; + args->first.iov_len = rqstp->rq_arg.head[0].iov_len; + args->first.iov_len -= (char *)p - base; + dlen = args->first.iov_len + rqstp->rq_arg.page_len + + rqstp->rq_arg.tail[0].iov_len; + if (dlen < XDR_QUADLEN(args->tlen) << 2) + return 0; return 1; } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 49b0a9e7ff18..1f04d2a70d25 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -223,8 +223,8 @@ static int nfs_cb_stat_to_errno(int status) return -status; } -static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, - int *status) +static int decode_cb_op_status(struct xdr_stream *xdr, + enum nfs_cb_opnum4 expected, int *status) { __be32 *p; u32 op; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 7d888369f85a..228faf00a594 
100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -165,7 +165,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) struct nfs4_client *clp = ls->ls_stid.sc_client; struct nfs4_file *fp = ls->ls_stid.sc_file; - trace_layoutstate_free(&ls->ls_stid.sc_stateid); + trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid); spin_lock(&clp->cl_lock); list_del_init(&ls->ls_perclnt); @@ -264,7 +264,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, list_add(&ls->ls_perfile, &fp->fi_lo_states); spin_unlock(&fp->fi_lock); - trace_layoutstate_alloc(&ls->ls_stid.sc_stateid); + trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid); return ls; } @@ -334,7 +334,7 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) if (list_empty(&ls->ls_layouts)) goto out_unlock; - trace_layout_recall(&ls->ls_stid.sc_stateid); + trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid); refcount_inc(&ls->ls_stid.sc_count); nfsd4_run_cb(&ls->ls_recall); @@ -507,7 +507,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, false, lrp->lr_layout_type, &ls); if (nfserr) { - trace_layout_return_lookup_fail(&lrp->lr_sid); + trace_nfsd_layout_return_lookup_fail(&lrp->lr_sid); return nfserr; } @@ -523,7 +523,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid); lrp->lrs_present = 1; } else { - trace_layoutstate_unhash(&ls->ls_stid.sc_stateid); + trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid); nfs4_unhash_stid(&ls->ls_stid); lrp->lrs_present = 0; } @@ -694,7 +694,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) /* * Unknown error or non-responding client, we'll need to fence. 
*/ - trace_layout_recall_fail(&ls->ls_stid.sc_stateid); + trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid); ops = nfsd4_layout_ops[ls->ls_layout_type]; if (ops->fence_client) @@ -703,7 +703,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) nfsd4_cb_layout_fail(ls); return -1; case -NFS4ERR_NOMATCHING_LAYOUT: - trace_layout_recall_done(&ls->ls_stid.sc_stateid); + trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid); task->tk_status = 0; return 1; } @@ -716,7 +716,7 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb) container_of(cb, struct nfs4_layout_stateid, ls_recall); LIST_HEAD(reaplist); - trace_layout_recall_release(&ls->ls_stid.sc_stateid); + trace_nfsd_layout_recall_release(&ls->ls_stid.sc_stateid); nfsd4_return_all_layouts(ls, &reaplist); nfsd4_free_layouts(&reaplist); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a0bed2b2004d..5d99e8810b85 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -32,6 +32,7 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <linux/fs_struct.h> #include <linux/file.h> #include <linux/falloc.h> #include <linux/slab.h> @@ -252,11 +253,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * Note: create modes (UNCHECKED,GUARDED...) are the same * in NFSv4 as in v3 except EXCLUSIVE4_1. 
*/ + current->fs->umask = open->op_umask; status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, open->op_fname.len, &open->op_iattr, *resfh, open->op_createmode, (u32 *)open->op_verf.data, &open->op_truncate, &open->op_created); + current->fs->umask = 0; if (!status && open->op_label.len) nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); @@ -603,6 +606,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) return status; + current->fs->umask = create->cr_umask; switch (create->cr_type) { case NF4LNK: status = nfsd_symlink(rqstp, &cstate->current_fh, @@ -611,20 +615,22 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case NF4BLK: + status = nfserr_inval; rdev = MKDEV(create->cr_specdata1, create->cr_specdata2); if (MAJOR(rdev) != create->cr_specdata1 || MINOR(rdev) != create->cr_specdata2) - return nfserr_inval; + goto out_umask; status = nfsd_create(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, &create->cr_iattr, S_IFBLK, rdev, &resfh); break; case NF4CHR: + status = nfserr_inval; rdev = MKDEV(create->cr_specdata1, create->cr_specdata2); if (MAJOR(rdev) != create->cr_specdata1 || MINOR(rdev) != create->cr_specdata2) - return nfserr_inval; + goto out_umask; status = nfsd_create(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, &create->cr_iattr,S_IFCHR, rdev, &resfh); @@ -668,6 +674,8 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fh_dup2(&cstate->current_fh, &resfh); out: fh_put(&resfh); +out_umask: + current->fs->umask = 0; return status; } @@ -751,6 +759,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (read->rd_offset >= OFFSET_MAX) return nfserr_inval; + trace_nfsd_read_start(rqstp, &cstate->current_fh, + read->rd_offset, read->rd_length); + /* * If we do a zero copy read, then a client will see read data * that reflects the state of the file *after* performing the 
@@ -783,6 +794,8 @@ nfsd4_read_release(union nfsd4_op_u *u) { if (u->read.rd_filp) fput(u->read.rd_filp); + trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp, + u->read.rd_offset, u->read.rd_length); } static __be32 @@ -1001,6 +1014,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (write->wr_offset >= OFFSET_MAX) return nfserr_inval; + cnt = write->wr_buflen; + trace_nfsd_write_start(rqstp, &cstate->current_fh, + write->wr_offset, cnt); status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, stateid, WR_STATE, &filp, NULL); if (status) { @@ -1008,7 +1024,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } - cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp)); @@ -1021,7 +1036,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fput(filp); write->wr_bytes_written = cnt; - + trace_nfsd_write_done(rqstp, &cstate->current_fh, + write->wr_offset, cnt); return status; } @@ -1106,7 +1122,6 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, else { copy->cp_res.wr_bytes_written = bytes; copy->cp_res.wr_stable_how = NFS_UNSTABLE; - copy->cp_consecutive = 1; copy->cp_synchronous = 1; gen_boot_verifier(©->cp_res.wr_verifier, SVC_NET(rqstp)); status = nfs_ok; @@ -1412,7 +1427,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp, nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid, true, lgp->lg_layout_type, &ls); if (nfserr) { - trace_layout_get_lookup_fail(&lgp->lg_sid); + trace_nfsd_layout_get_lookup_fail(&lgp->lg_sid); goto out; } @@ -1481,7 +1496,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, false, lcp->lc_layout_type, &ls); if (nfserr) { - trace_layout_commit_lookup_fail(&lcp->lc_sid); + trace_nfsd_layout_commit_lookup_fail(&lcp->lc_sid); /* fixup error code as per RFC5661 */ if (nfserr == nfserr_bad_stateid) nfserr = nfserr_badlayout; @@ -1714,12 
+1729,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) goto encode_op; } + trace_nfsd_compound(rqstp, args->opcnt); while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; - dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", - resp->opcnt, args->opcnt, op->opnum, - nfsd4_op_name(op->opnum)); /* * The XDR decode routines may have pre-set op->status; * for example, if there is a miscellaneous XDR error @@ -1793,9 +1806,8 @@ encode_op: status = op->status; } - dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n", - args->ops, args->opcnt, resp->opcnt, op->opnum, - be32_to_cpu(status)); + trace_nfsd_compound_status(args->opcnt, resp->opcnt, status, + nfsd4_op_name(op->opnum)); nfsd4_cstate_clear_replay(cstate); nfsd4_increment_op_stats(op->opnum); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 61b770e39809..fc74d6f46bd5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -98,6 +98,7 @@ enum nfsd4_st_mutex_lock_subclass { */ static DECLARE_WAIT_QUEUE_HEAD(close_wq); +static struct kmem_cache *client_slab; static struct kmem_cache *openowner_slab; static struct kmem_cache *lockowner_slab; static struct kmem_cache *file_slab; @@ -806,7 +807,8 @@ static void block_delegations(struct knfsd_fh *fh) } static struct nfs4_delegation * -alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh, +alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, + struct svc_fh *current_fh, struct nfs4_clnt_odstate *odstate) { struct nfs4_delegation *dp; @@ -837,6 +839,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh, dp->dl_retries = 1; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); + get_nfs4_file(fp); + dp->dl_stid.sc_file = fp; return dp; out_dec: atomic_long_dec(&num_delegations); @@ -874,19 +878,35 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid) spin_unlock(&stid->sc_lock); } -static void nfs4_put_deleg_lease(struct 
nfs4_file *fp) +static void put_deleg_file(struct nfs4_file *fp) { struct file *filp = NULL; spin_lock(&fp->fi_lock); - if (fp->fi_deleg_file && --fp->fi_delegees == 0) + if (--fp->fi_delegees == 0) swap(filp, fp->fi_deleg_file); spin_unlock(&fp->fi_lock); - if (filp) { - vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp); + if (filp) fput(filp); - } +} + +static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) +{ + struct nfs4_file *fp = dp->dl_stid.sc_file; + struct file *filp = fp->fi_deleg_file; + + WARN_ON_ONCE(!fp->fi_delegees); + + vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp); + put_deleg_file(fp); +} + +static void destroy_unhashed_deleg(struct nfs4_delegation *dp) +{ + put_clnt_odstate(dp->dl_clnt_odstate); + nfs4_unlock_deleg_lease(dp); + nfs4_put_stid(&dp->dl_stid); } void nfs4_unhash_stid(struct nfs4_stid *s) @@ -895,20 +915,16 @@ void nfs4_unhash_stid(struct nfs4_stid *s) } /** - * nfs4_get_existing_delegation - Discover if this delegation already exists + * nfs4_delegation_exists - Discover if this delegation already exists * @clp: a pointer to the nfs4_client we're granting a delegation to * @fp: a pointer to the nfs4_file we're granting a delegation on * * Return: - * On success: NULL if an existing delegation was not found. - * - * On error: -EAGAIN if one was previously granted to this nfs4_client - * for this nfs4_file. 
- * + * On success: true iff an existing delegation is found */ -static int -nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp) +static bool +nfs4_delegation_exists(struct nfs4_client *clp, struct nfs4_file *fp) { struct nfs4_delegation *searchdp = NULL; struct nfs4_client *searchclp = NULL; @@ -919,10 +935,10 @@ nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp) list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) { searchclp = searchdp->dl_stid.sc_client; if (clp == searchclp) { - return -EAGAIN; + return true; } } - return 0; + return false; } /** @@ -941,16 +957,13 @@ nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp) static int hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) { - int status; struct nfs4_client *clp = dp->dl_stid.sc_client; lockdep_assert_held(&state_lock); lockdep_assert_held(&fp->fi_lock); - status = nfs4_get_existing_delegation(clp, fp); - if (status) - return status; - ++fp->fi_delegees; + if (nfs4_delegation_exists(clp, fp)) + return -EAGAIN; refcount_inc(&dp->dl_stid.sc_count); dp->dl_stid.sc_type = NFS4_DELEG_STID; list_add(&dp->dl_perfile, &fp->fi_delegations); @@ -986,11 +999,8 @@ static void destroy_delegation(struct nfs4_delegation *dp) spin_lock(&state_lock); unhashed = unhash_delegation_locked(dp); spin_unlock(&state_lock); - if (unhashed) { - put_clnt_odstate(dp->dl_clnt_odstate); - nfs4_put_deleg_lease(dp->dl_stid.sc_file); - nfs4_put_stid(&dp->dl_stid); - } + if (unhashed) + destroy_unhashed_deleg(dp); } static void revoke_delegation(struct nfs4_delegation *dp) @@ -999,17 +1009,14 @@ static void revoke_delegation(struct nfs4_delegation *dp) WARN_ON(!list_empty(&dp->dl_recall_lru)); - put_clnt_odstate(dp->dl_clnt_odstate); - nfs4_put_deleg_lease(dp->dl_stid.sc_file); - - if (clp->cl_minorversion == 0) - nfs4_put_stid(&dp->dl_stid); - else { + if (clp->cl_minorversion) { dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; + 
refcount_inc(&dp->dl_stid.sc_count); spin_lock(&clp->cl_lock); list_add(&dp->dl_recall_lru, &clp->cl_revoked); spin_unlock(&clp->cl_lock); } + destroy_unhashed_deleg(dp); } /* @@ -1794,7 +1801,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) struct nfs4_client *clp; int i; - clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); + clp = kmem_cache_zalloc(client_slab, GFP_KERNEL); if (clp == NULL) return NULL; clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL); @@ -1825,7 +1832,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) err_no_hashtbl: kfree(clp->cl_name.data); err_no_name: - kfree(clp); + kmem_cache_free(client_slab, clp); return NULL; } @@ -1845,7 +1852,7 @@ free_client(struct nfs4_client *clp) kfree(clp->cl_ownerstr_hashtbl); kfree(clp->cl_name.data); idr_destroy(&clp->cl_stateids); - kfree(clp); + kmem_cache_free(client_slab, clp); } /* must be called under the client_lock */ @@ -1911,9 +1918,7 @@ __destroy_client(struct nfs4_client *clp) while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); - put_clnt_odstate(dp->dl_clnt_odstate); - nfs4_put_deleg_lease(dp->dl_stid.sc_file); - nfs4_put_stid(&dp->dl_stid); + destroy_unhashed_deleg(dp); } while (!list_empty(&clp->cl_revoked)) { dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru); @@ -2953,7 +2958,7 @@ out_no_session: static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) { if (!session) - return 0; + return false; return !memcmp(sid, &session->se_sessionid, sizeof(*sid)); } @@ -3471,21 +3476,26 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, void nfsd4_free_slabs(void) { - kmem_cache_destroy(odstate_slab); + kmem_cache_destroy(client_slab); kmem_cache_destroy(openowner_slab); kmem_cache_destroy(lockowner_slab); kmem_cache_destroy(file_slab); kmem_cache_destroy(stateid_slab); 
kmem_cache_destroy(deleg_slab); + kmem_cache_destroy(odstate_slab); } int nfsd4_init_slabs(void) { + client_slab = kmem_cache_create("nfsd4_clients", + sizeof(struct nfs4_client), 0, 0, NULL); + if (client_slab == NULL) + goto out; openowner_slab = kmem_cache_create("nfsd4_openowners", sizeof(struct nfs4_openowner), 0, 0, NULL); if (openowner_slab == NULL) - goto out; + goto out_free_client_slab; lockowner_slab = kmem_cache_create("nfsd4_lockowners", sizeof(struct nfs4_lockowner), 0, 0, NULL); if (lockowner_slab == NULL) @@ -3518,6 +3528,8 @@ out_free_lockowner_slab: kmem_cache_destroy(lockowner_slab); out_free_openowner_slab: kmem_cache_destroy(openowner_slab); +out_free_client_slab: + kmem_cache_destroy(client_slab); out: dprintk("nfsd4: out of memory while initializing nfsv4\n"); return -ENOMEM; @@ -3945,17 +3957,9 @@ static bool nfsd_break_deleg_cb(struct file_lock *fl) { bool ret = false; - struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; - struct nfs4_delegation *dp; + struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; + struct nfs4_file *fp = dp->dl_stid.sc_file; - if (!fp) { - WARN(1, "(%p)->fl_owner NULL\n", fl); - return ret; - } - if (fp->fi_had_conflict) { - WARN(1, "duplicate break on %p\n", fp); - return ret; - } /* * We don't want the locks code to timeout the lease for us; * we'll remove it ourself if a delegation isn't returned @@ -3965,15 +3969,7 @@ nfsd_break_deleg_cb(struct file_lock *fl) spin_lock(&fp->fi_lock); fp->fi_had_conflict = true; - /* - * If there are no delegations on the list, then return true - * so that the lease code will go ahead and delete it. 
- */ - if (list_empty(&fp->fi_delegations)) - ret = true; - else - list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) - nfsd_break_one_deleg(dp); + nfsd_break_one_deleg(dp); spin_unlock(&fp->fi_lock); return ret; } @@ -4297,7 +4293,8 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; } -static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag) +static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, + int flag) { struct file_lock *fl; @@ -4308,124 +4305,88 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag) fl->fl_flags = FL_DELEG; fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; fl->fl_end = OFFSET_MAX; - fl->fl_owner = (fl_owner_t)fp; + fl->fl_owner = (fl_owner_t)dp; fl->fl_pid = current->tgid; + fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file; return fl; } -/** - * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer - * @dp: a pointer to the nfs4_delegation we're adding. - * - * Return: - * On success: Return code will be 0 on success. - * - * On error: -EAGAIN if there was an existing delegation. - * nonzero if there is an error in other cases. - * - */ - -static int nfs4_setlease(struct nfs4_delegation *dp) -{ - struct nfs4_file *fp = dp->dl_stid.sc_file; - struct file_lock *fl; - struct file *filp; - int status = 0; - - fl = nfs4_alloc_init_lease(fp, NFS4_OPEN_DELEGATE_READ); - if (!fl) - return -ENOMEM; - filp = find_readable_file(fp); - if (!filp) { - /* We should always have a readable file here */ - WARN_ON_ONCE(1); - locks_free_lock(fl); - return -EBADF; - } - fl->fl_file = filp; - status = vfs_setlease(filp, fl->fl_type, &fl, NULL); - if (fl) - locks_free_lock(fl); - if (status) - goto out_fput; - spin_lock(&state_lock); - spin_lock(&fp->fi_lock); - /* Did the lease get broken before we took the lock? 
*/ - status = -EAGAIN; - if (fp->fi_had_conflict) - goto out_unlock; - /* Race breaker */ - if (fp->fi_deleg_file) { - status = hash_delegation_locked(dp, fp); - goto out_unlock; - } - fp->fi_deleg_file = filp; - fp->fi_delegees = 0; - status = hash_delegation_locked(dp, fp); - spin_unlock(&fp->fi_lock); - spin_unlock(&state_lock); - if (status) { - /* Should never happen, this is a new fi_deleg_file */ - WARN_ON_ONCE(1); - goto out_fput; - } - return 0; -out_unlock: - spin_unlock(&fp->fi_lock); - spin_unlock(&state_lock); -out_fput: - fput(filp); - return status; -} - static struct nfs4_delegation * nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate) { - int status; + int status = 0; struct nfs4_delegation *dp; + struct file *filp; + struct file_lock *fl; + /* + * The fi_had_conflict and nfs_get_existing_delegation checks + * here are just optimizations; we'll need to recheck them at + * the end: + */ if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); + filp = find_readable_file(fp); + if (!filp) { + /* We should always have a readable file here */ + WARN_ON_ONCE(1); + return ERR_PTR(-EBADF); + } spin_lock(&state_lock); spin_lock(&fp->fi_lock); - status = nfs4_get_existing_delegation(clp, fp); + if (nfs4_delegation_exists(clp, fp)) + status = -EAGAIN; + else if (!fp->fi_deleg_file) { + fp->fi_deleg_file = filp; + /* increment early to prevent fi_deleg_file from being + * cleared */ + fp->fi_delegees = 1; + filp = NULL; + } else + fp->fi_delegees++; spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); - + if (filp) + fput(filp); if (status) return ERR_PTR(status); - dp = alloc_init_deleg(clp, fh, odstate); + status = -ENOMEM; + dp = alloc_init_deleg(clp, fp, fh, odstate); if (!dp) - return ERR_PTR(-ENOMEM); + goto out_delegees; + + fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); + if (!fl) + goto out_stid; + + status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL); + if (fl) + 
locks_free_lock(fl); + if (status) + goto out_clnt_odstate; - get_nfs4_file(fp); spin_lock(&state_lock); spin_lock(&fp->fi_lock); - dp->dl_stid.sc_file = fp; - if (!fp->fi_deleg_file) { - spin_unlock(&fp->fi_lock); - spin_unlock(&state_lock); - status = nfs4_setlease(dp); - goto out; - } - if (fp->fi_had_conflict) { + if (fp->fi_had_conflict) status = -EAGAIN; - goto out_unlock; - } - status = hash_delegation_locked(dp, fp); -out_unlock: + else + status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); -out: - if (status) { - put_clnt_odstate(dp->dl_clnt_odstate); - nfs4_put_stid(&dp->dl_stid); - return ERR_PTR(status); - } + + if (status) + destroy_unhashed_deleg(dp); return dp; +out_clnt_odstate: + put_clnt_odstate(dp->dl_clnt_odstate); +out_stid: + nfs4_put_stid(&dp->dl_stid); +out_delegees: + put_deleg_file(fp); + return ERR_PTR(status); } static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) @@ -5521,15 +5482,26 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; stp->st_stid.sc_type = NFS4_CLOSED_STID; + + /* + * Technically we don't _really_ have to increment or copy it, since + * it should just be gone after this operation and we clobber the + * copied value below, but we continue to do so here just to ensure + * that racing ops see that there was a state change. + */ nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); nfsd4_close_open_stateid(stp); mutex_unlock(&stp->st_mutex); - /* See RFC5661 sectionm 18.2.4 */ - if (stp->st_stid.sc_client->cl_minorversion) - memcpy(&close->cl_stateid, &close_stateid, - sizeof(close->cl_stateid)); + /* v4.1+ suggests that we send a special stateid in here, since the + * clients should just ignore this anyway. Since this is not useful + * for v4.0 clients either, we set it to the special close_stateid + * universally. 
+ * + * See RFC5661 section 18.2.4, and RFC7530 section 16.2.5 + */ + memcpy(&close->cl_stateid, &close_stateid, sizeof(close->cl_stateid)); /* put reference from nfs4_preprocess_seqid_op */ nfs4_put_stid(&stp->st_stid); @@ -7264,9 +7236,7 @@ nfs4_state_shutdown_net(struct net *net) list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); - put_clnt_odstate(dp->dl_clnt_odstate); - nfs4_put_deleg_lease(dp->dl_stid.sc_file); - nfs4_put_stid(&dp->dl_stid); + destroy_unhashed_deleg(dp); } nfsd4_client_tracking_exit(net); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e502fd16246b..1d048dd95464 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -33,7 +33,6 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include <linux/fs_struct.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/namei.h> @@ -682,7 +681,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl, &create->cr_label, - &current->fs->umask); + &create->cr_umask); if (status) goto out; @@ -927,7 +926,6 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) case NFS4_OPEN_NOCREATE: break; case NFS4_OPEN_CREATE: - current->fs->umask = 0; READ_BUF(4); open->op_createmode = be32_to_cpup(p++); switch (open->op_createmode) { @@ -935,7 +933,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) case NFS4_CREATE_GUARDED: status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl, &open->op_label, - &current->fs->umask); + &open->op_umask); if (status) goto out; break; @@ -950,7 +948,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl, &open->op_label, -
&current->fs->umask); + &open->op_umask); if (status) goto out; break; @@ -1759,7 +1757,7 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) p = xdr_decode_hyper(p, &copy->cp_src_pos); p = xdr_decode_hyper(p, &copy->cp_dst_pos); p = xdr_decode_hyper(p, &copy->cp_count); - copy->cp_consecutive = be32_to_cpup(p++); + p++; /* ca_consecutive: we always do consecutive copies */ copy->cp_synchronous = be32_to_cpup(p++); tmp = be32_to_cpup(p); /* Source server list not supported */ @@ -3427,8 +3425,9 @@ static __be32 nfsd4_encode_splice_read( return nfserr_resource; len = maxcount; - nfserr = nfsd_splice_read(read->rd_rqstp, file, - read->rd_offset, &maxcount); + nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, + file, read->rd_offset, &maxcount); + read->rd_length = maxcount; if (nfserr) { /* * nfsd_splice_actor may have already messed with the @@ -3511,8 +3510,9 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, read->rd_vlen = v; len = maxcount; - nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec, - read->rd_vlen, &maxcount); + nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, + resp->rqstp->rq_vec, read->rd_vlen, &maxcount); + read->rd_length = maxcount; if (nfserr) return nfserr; xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); @@ -4214,7 +4214,7 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, return nfserr; p = xdr_reserve_space(&resp->xdr, 4 + 4); - *p++ = cpu_to_be32(copy->cp_consecutive); + *p++ = xdr_one; /* cr_consecutive */ *p++ = cpu_to_be32(copy->cp_synchronous); return 0; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 8aa011820c4a..a008e7634181 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -87,13 +87,23 @@ nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry, return nfserr_inval; } +static bool nfsd_originating_port_ok(struct svc_rqst *rqstp, int flags) +{ + if (flags & NFSEXP_INSECURE_PORT) + return true; + /* We don't require gss
requests to use low ports: */ + if (rqstp->rq_cred.cr_flavor >= RPC_AUTH_GSS) + return true; + return test_bit(RQ_SECURE, &rqstp->rq_flags); +} + static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, struct svc_export *exp) { int flags = nfsexp_flags(rqstp, exp); /* Check if the request originated from a secure port. */ - if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && !(flags & NFSEXP_INSECURE_PORT)) { + if (!nfsd_originating_port_ok(rqstp, flags)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("nfsd: request from insecure port %s!\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 43c0419b8ddb..f107f9fa8e15 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -212,13 +212,18 @@ nfsd_proc_write(struct svc_rqst *rqstp) struct nfsd_attrstat *resp = rqstp->rq_resp; __be32 nfserr; unsigned long cnt = argp->len; + unsigned int nvecs; dprintk("nfsd: WRITE %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->len, argp->offset); - nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, - rqstp->rq_vec, argp->vlen, &cnt, NFS_DATA_SYNC); + nvecs = svc_fill_write_vector(rqstp, &argp->first, cnt); + if (!nvecs) + return nfserr_io; + nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), + argp->offset, rqstp->rq_vec, nvecs, + &cnt, NFS_DATA_SYNC); return nfsd_return_attrs(nfserr, resp); } @@ -444,17 +449,19 @@ nfsd_proc_symlink(struct svc_rqst *rqstp) struct svc_fh newfh; __be32 nfserr; + if (argp->tlen > NFS_MAXPATHLEN) + return nfserr_nametoolong; + + argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, + argp->tlen); + if (IS_ERR(argp->tname)) + return nfserrno(PTR_ERR(argp->tname)); + dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n", SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, argp->tlen, argp->tname); fh_init(&newfh, NFS_FHSIZE); - /* - * Crazy hack: the request fits in a page, and already-decoded - * attributes follow argp->tname, so it's safe to just write a - * null to ensure 
it's null-terminated: - */ - argp->tname[argp->tlen] = '\0'; nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, argp->tname, &newfh); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 79b6064f8977..a43e8260520a 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -71,22 +71,6 @@ decode_filename(__be32 *p, char **namp, unsigned int *lenp) } static __be32 * -decode_pathname(__be32 *p, char **namp, unsigned int *lenp) -{ - char *name; - unsigned int i; - - if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) { - for (i = 0, name = *namp; i < *lenp; i++, name++) { - if (*name == '\0') - return NULL; - } - } - - return p; -} - -static __be32 * decode_sattr(__be32 *p, struct iattr *iap) { u32 tmp, tmp1; @@ -287,7 +271,6 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) struct nfsd_writeargs *args = rqstp->rq_argp; unsigned int len, hdr, dlen; struct kvec *head = rqstp->rq_arg.head; - int v; p = decode_fh(p, &args->fh); if (!p) @@ -323,17 +306,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) if (dlen < XDR_QUADLEN(len)*4) return 0; - rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = head->iov_len - hdr; - v = 0; - while (len > rqstp->rq_vec[v].iov_len) { - len -= rqstp->rq_vec[v].iov_len; - v++; - rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); - rqstp->rq_vec[v].iov_len = PAGE_SIZE; - } - rqstp->rq_vec[v].iov_len = len; - args->vlen = v + 1; + args->first.iov_base = (void *)p; + args->first.iov_len = head->iov_len - hdr; return 1; } @@ -394,14 +368,39 @@ int nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_symlinkargs *args = rqstp->rq_argp; + char *base = (char *)p; + size_t xdrlen; if ( !(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen)) - || !(p = decode_pathname(p, &args->tname, &args->tlen))) + || !(p = decode_filename(p, &args->fname, &args->flen))) return 0; - p = decode_sattr(p, &args->attrs); - 
return xdr_argsize_check(rqstp, p); + args->tlen = ntohl(*p++); + if (args->tlen == 0) + return 0; + + args->first.iov_base = p; + args->first.iov_len = rqstp->rq_arg.head[0].iov_len; + args->first.iov_len -= (char *)p - base; + + /* This request is never larger than a page. Therefore, + * transport will deliver either: + * 1. pathname in the pagelist -> sattr is in the tail. + * 2. everything in the head buffer -> sattr is in the head. + */ + if (rqstp->rq_arg.page_len) { + if (args->tlen != rqstp->rq_arg.page_len) + return 0; + p = rqstp->rq_arg.tail[0].iov_base; + } else { + xdrlen = XDR_QUADLEN(args->tlen); + if (xdrlen > args->first.iov_len - (8 * sizeof(__be32))) + return 0; + p += xdrlen; + } + decode_sattr(p, &args->attrs); + + return 1; } int diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 8b2f1d92c579..80933e4334d8 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -11,39 +11,79 @@ #include <linux/tracepoint.h> #include "nfsfh.h" +TRACE_EVENT(nfsd_compound, + TP_PROTO(const struct svc_rqst *rqst, + u32 args_opcnt), + TP_ARGS(rqst, args_opcnt), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, args_opcnt) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->args_opcnt = args_opcnt; + ), + TP_printk("xid=0x%08x opcnt=%u", + __entry->xid, __entry->args_opcnt) +) + +TRACE_EVENT(nfsd_compound_status, + TP_PROTO(u32 args_opcnt, + u32 resp_opcnt, + __be32 status, + const char *name), + TP_ARGS(args_opcnt, resp_opcnt, status, name), + TP_STRUCT__entry( + __field(u32, args_opcnt) + __field(u32, resp_opcnt) + __field(int, status) + __string(name, name) + ), + TP_fast_assign( + __entry->args_opcnt = args_opcnt; + __entry->resp_opcnt = resp_opcnt; + __entry->status = be32_to_cpu(status); + __assign_str(name, name); + ), + TP_printk("op=%u/%u %s status=%d", + __entry->resp_opcnt, __entry->args_opcnt, + __get_str(name), __entry->status) +) + DECLARE_EVENT_CLASS(nfsd_io_class, TP_PROTO(struct svc_rqst *rqstp, struct svc_fh *fhp, 
loff_t offset, - int len), + unsigned long len), TP_ARGS(rqstp, fhp, offset, len), TP_STRUCT__entry( - __field(__be32, xid) - __field_struct(struct knfsd_fh, fh) + __field(u32, xid) + __field(u32, fh_hash) __field(loff_t, offset) - __field(int, len) + __field(unsigned long, len) ), TP_fast_assign( - __entry->xid = rqstp->rq_xid, - fh_copy_shallow(&__entry->fh, &fhp->fh_handle); + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); __entry->offset = offset; __entry->len = len; ), - TP_printk("xid=0x%x fh=0x%x offset=%lld len=%d", - __be32_to_cpu(__entry->xid), knfsd_fh_hash(&__entry->fh), + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld len=%lu", + __entry->xid, __entry->fh_hash, __entry->offset, __entry->len) ) #define DEFINE_NFSD_IO_EVENT(name) \ -DEFINE_EVENT(nfsd_io_class, name, \ +DEFINE_EVENT(nfsd_io_class, nfsd_##name, \ TP_PROTO(struct svc_rqst *rqstp, \ struct svc_fh *fhp, \ loff_t offset, \ - int len), \ + unsigned long len), \ TP_ARGS(rqstp, fhp, offset, len)) DEFINE_NFSD_IO_EVENT(read_start); -DEFINE_NFSD_IO_EVENT(read_opened); +DEFINE_NFSD_IO_EVENT(read_splice); +DEFINE_NFSD_IO_EVENT(read_vector); DEFINE_NFSD_IO_EVENT(read_io_done); DEFINE_NFSD_IO_EVENT(read_done); DEFINE_NFSD_IO_EVENT(write_start); @@ -51,6 +91,40 @@ DEFINE_NFSD_IO_EVENT(write_opened); DEFINE_NFSD_IO_EVENT(write_io_done); DEFINE_NFSD_IO_EVENT(write_done); +DECLARE_EVENT_CLASS(nfsd_err_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *fhp, + loff_t offset, + int status), + TP_ARGS(rqstp, fhp, offset, status), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, fh_hash) + __field(loff_t, offset) + __field(int, status) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->offset = offset; + __entry->status = status; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld status=%d", + __entry->xid, __entry->fh_hash, + __entry->offset, __entry->status) +) 
+ +#define DEFINE_NFSD_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_err_class, nfsd_##name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *fhp, \ + loff_t offset, \ + int len), \ + TP_ARGS(rqstp, fhp, offset, len)) + +DEFINE_NFSD_ERR_EVENT(read_err); +DEFINE_NFSD_ERR_EVENT(write_err); + #include "state.h" DECLARE_EVENT_CLASS(nfsd_stateid_class, @@ -76,7 +150,7 @@ DECLARE_EVENT_CLASS(nfsd_stateid_class, ) #define DEFINE_STATEID_EVENT(name) \ -DEFINE_EVENT(nfsd_stateid_class, name, \ +DEFINE_EVENT(nfsd_stateid_class, nfsd_##name, \ TP_PROTO(stateid_t *stp), \ TP_ARGS(stp)) DEFINE_STATEID_EVENT(layoutstate_alloc); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a3c9bfa77def..2410b093a2e6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -881,20 +881,24 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } -static __be32 -nfsd_finish_read(struct file *file, unsigned long *count, int host_err) +static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + unsigned long *count, int host_err) { if (host_err >= 0) { nfsdstats.io_read += host_err; *count = host_err; fsnotify_access(file); + trace_nfsd_read_io_done(rqstp, fhp, offset, *count); return 0; - } else + } else { + trace_nfsd_read_err(rqstp, fhp, offset, host_err); return nfserrno(host_err); + } } -__be32 nfsd_splice_read(struct svc_rqst *rqstp, - struct file *file, loff_t offset, unsigned long *count) +__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, unsigned long *count) { struct splice_desc sd = { .len = 0, @@ -904,21 +908,23 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, }; int host_err; + trace_nfsd_read_splice(rqstp, fhp, offset, *count); rqstp->rq_next_page = rqstp->rq_respages + 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); - return nfsd_finish_read(file, count, host_err); + return 
nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); } -__be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long *count) +__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + struct kvec *vec, int vlen, unsigned long *count) { struct iov_iter iter; int host_err; + trace_nfsd_read_vector(rqstp, fhp, offset, *count); iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count); host_err = vfs_iter_read(file, &iter, &offset, 0); - - return nfsd_finish_read(file, count, host_err); + return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); } /* @@ -965,13 +971,15 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, { struct svc_export *exp; struct iov_iter iter; - __be32 err = 0; + __be32 nfserr; int host_err; int use_wgather; loff_t pos = offset; unsigned int pflags = current->flags; rwf_t flags = 0; + trace_nfsd_write_opened(rqstp, fhp, offset, *cnt); + if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* * We want less throttling in balance_dirty_pages() @@ -994,22 +1002,23 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, host_err = vfs_iter_write(file, &iter, &pos, flags); if (host_err < 0) goto out_nfserr; - *cnt = host_err; - nfsdstats.io_write += host_err; + nfsdstats.io_write += *cnt; fsnotify_modify(file); if (stable && use_wgather) host_err = wait_for_concurrent_writes(file); out_nfserr: - dprintk("nfsd: write complete host_err=%d\n", host_err); - if (host_err >= 0) - err = 0; - else - err = nfserrno(host_err); + if (host_err >= 0) { + trace_nfsd_write_io_done(rqstp, fhp, offset, *cnt); + nfserr = nfs_ok; + } else { + trace_nfsd_write_err(rqstp, fhp, offset, host_err); + nfserr = nfserrno(host_err); + } if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) current_restore_flags(pflags, PF_LESS_THROTTLE); - return err; + return nfserr; } /* @@ -1024,27 +1033,23 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh 
*fhp, struct raparms *ra; __be32 err; - trace_read_start(rqstp, fhp, offset, vlen); + trace_nfsd_read_start(rqstp, fhp, offset, *count); err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); if (err) return err; ra = nfsd_init_raparms(file); - trace_read_opened(rqstp, fhp, offset, vlen); - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) - err = nfsd_splice_read(rqstp, file, offset, count); + err = nfsd_splice_read(rqstp, fhp, file, offset, count); else - err = nfsd_readv(file, offset, vec, vlen, count); - - trace_read_io_done(rqstp, fhp, offset, vlen); + err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count); if (ra) nfsd_put_raparams(file, ra); fput(file); - trace_read_done(rqstp, fhp, offset, vlen); + trace_nfsd_read_done(rqstp, fhp, offset, *count); return err; } @@ -1061,18 +1066,16 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, struct file *file = NULL; __be32 err = 0; - trace_write_start(rqstp, fhp, offset, vlen); + trace_nfsd_write_start(rqstp, fhp, offset, *cnt); err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); if (err) goto out; - trace_write_opened(rqstp, fhp, offset, vlen); err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable); - trace_write_io_done(rqstp, fhp, offset, vlen); fput(file); out: - trace_write_done(rqstp, fhp, offset, vlen); + trace_nfsd_write_done(rqstp, fhp, offset, *cnt); return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index be6d8e00453f..a7e107309f76 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -78,10 +78,13 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); struct raparms; -__be32 nfsd_splice_read(struct svc_rqst *, - struct file *, loff_t, unsigned long *); -__be32 nfsd_readv(struct file *, loff_t, struct kvec *, int, - unsigned long *); +__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + 
unsigned long *count); +__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + struct kvec *vec, int vlen, + unsigned long *count); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 2f4f22e6b8cb..ea7cca3a64b7 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -34,7 +34,7 @@ struct nfsd_writeargs { svc_fh fh; __u32 offset; int len; - int vlen; + struct kvec first; }; struct nfsd_createargs { @@ -72,6 +72,7 @@ struct nfsd_symlinkargs { char * tname; unsigned int tlen; struct iattr attrs; + struct kvec first; }; struct nfsd_readdirargs { diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 056bf8a7364e..2cb29e961a76 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -41,7 +41,7 @@ struct nfsd3_writeargs { __u32 count; int stable; __u32 len; - int vlen; + struct kvec first; }; struct nfsd3_createargs { @@ -90,6 +90,7 @@ struct nfsd3_symlinkargs { char * tname; unsigned int tlen; struct iattr attrs; + struct kvec first; }; struct nfsd3_readdirargs { diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index bc29511b6405..17c453a7999c 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -110,6 +110,7 @@ struct nfsd4_create { struct { u32 datalen; char *data; + struct kvec first; } link; /* NF4LNK */ struct { u32 specdata1; @@ -118,12 +119,14 @@ struct nfsd4_create { } u; u32 cr_bmval[3]; /* request */ struct iattr cr_iattr; /* request */ + int cr_umask; /* request */ struct nfsd4_change_info cr_cinfo; /* response */ struct nfs4_acl *cr_acl; struct xdr_netobj cr_label; }; #define cr_datalen u.link.datalen #define cr_data u.link.data +#define cr_first u.link.first #define cr_specdata1 u.dev.specdata1 #define cr_specdata2 u.dev.specdata2 @@ -228,6 +231,7 @@ struct nfsd4_open { u32 op_why_no_deleg; /* response - DELEG_NONE_EXT only */ u32 op_create; /* request */ u32 op_createmode; /* 
request */ + int op_umask; /* request */ u32 op_bmval[3]; /* request */ struct iattr op_iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ nfs4_verifier op_verf __attribute__((aligned(32))); @@ -518,7 +522,6 @@ struct nfsd4_copy { u64 cp_count; /* both */ - bool cp_consecutive; bool cp_synchronous; /* response */ diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index c21e0b4454a6..dec98cab729d 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -193,9 +193,9 @@ retry: (unsigned long long)oldkey, (unsigned long long)newkey); - spin_lock_irq(&btnc->tree_lock); - err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); - spin_unlock_irq(&btnc->tree_lock); + xa_lock_irq(&btnc->i_pages); + err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page); + xa_unlock_irq(&btnc->i_pages); /* * Note: page->index will not change to newkey until * nilfs_btnode_commit_change_key() will be called. @@ -251,11 +251,11 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, (unsigned long long)newkey); mark_buffer_dirty(obh); - spin_lock_irq(&btnc->tree_lock); - radix_tree_delete(&btnc->page_tree, oldkey); - radix_tree_tag_set(&btnc->page_tree, newkey, + xa_lock_irq(&btnc->i_pages); + radix_tree_delete(&btnc->i_pages, oldkey); + radix_tree_tag_set(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&btnc->tree_lock); + xa_unlock_irq(&btnc->i_pages); opage->index = obh->b_blocknr = newkey; unlock_page(opage); @@ -283,9 +283,9 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, return; if (nbh == NULL) { /* blocksize == pagesize */ - spin_lock_irq(&btnc->tree_lock); - radix_tree_delete(&btnc->page_tree, newkey); - spin_unlock_irq(&btnc->tree_lock); + xa_lock_irq(&btnc->i_pages); + radix_tree_delete(&btnc->i_pages, newkey); + xa_unlock_irq(&btnc->i_pages); unlock_page(ctxt->bh->b_page); } else brelse(nbh); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 68241512d7c1..4cb850a6f1c2 100644 --- a/fs/nilfs2/page.c +++ 
b/fs/nilfs2/page.c @@ -331,15 +331,15 @@ repeat: struct page *page2; /* move the page to the destination cache */ - spin_lock_irq(&smap->tree_lock); - page2 = radix_tree_delete(&smap->page_tree, offset); + xa_lock_irq(&smap->i_pages); + page2 = radix_tree_delete(&smap->i_pages, offset); WARN_ON(page2 != page); smap->nrpages--; - spin_unlock_irq(&smap->tree_lock); + xa_unlock_irq(&smap->i_pages); - spin_lock_irq(&dmap->tree_lock); - err = radix_tree_insert(&dmap->page_tree, offset, page); + xa_lock_irq(&dmap->i_pages); + err = radix_tree_insert(&dmap->i_pages, offset, page); if (unlikely(err < 0)) { WARN_ON(err == -EEXIST); page->mapping = NULL; @@ -348,11 +348,11 @@ repeat: page->mapping = dmap; dmap->nrpages++; if (PageDirty(page)) - radix_tree_tag_set(&dmap->page_tree, + radix_tree_tag_set(&dmap->i_pages, offset, PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&dmap->tree_lock); + xa_unlock_irq(&dmap->i_pages); } unlock_page(page); } @@ -474,15 +474,15 @@ int __nilfs_clear_page_dirty(struct page *page) struct address_space *mapping = page->mapping; if (mapping) { - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (test_bit(PG_dirty, &page->flags)) { - radix_tree_tag_clear(&mapping->page_tree, + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return clear_page_dirty_for_io(page); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return 0; } return TestClearPageDirty(page); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 6702a6a0bbb5..d51e1bb781cf 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -139,23 +139,32 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, return false; } -struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, +struct fanotify_event_info *fanotify_alloc_event(struct 
fsnotify_group *group, + struct inode *inode, u32 mask, const struct path *path) { struct fanotify_event_info *event; + gfp_t gfp = GFP_KERNEL; + + /* + * For queues with unlimited length lost events are not expected and + * can possibly have security implications. Avoid losing events when + * memory is short. + */ + if (group->max_events == UINT_MAX) + gfp |= __GFP_NOFAIL; if (fanotify_is_perm_event(mask)) { struct fanotify_perm_event_info *pevent; - pevent = kmem_cache_alloc(fanotify_perm_event_cachep, - GFP_KERNEL); + pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); if (!pevent) return NULL; event = &pevent->fae; pevent->response = 0; goto init; } - event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + event = kmem_cache_alloc(fanotify_event_cachep, gfp); if (!event) return NULL; init: __maybe_unused @@ -210,10 +219,17 @@ static int fanotify_handle_event(struct fsnotify_group *group, return 0; } - event = fanotify_alloc_event(inode, mask, data); + event = fanotify_alloc_event(group, inode, mask, data); ret = -ENOMEM; - if (unlikely(!event)) + if (unlikely(!event)) { + /* + * We don't queue overflow events for permission events as + * there the access is denied and so no event is in fact lost. 
+ */ + if (!fanotify_is_perm_event(mask)) + fsnotify_queue_overflow(group); goto finish; + } fsn_event = &event->fse; ret = fsnotify_add_event(group, fsn_event, fanotify_merge); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 256d9d1ddea9..8609ba06f474 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -52,5 +52,6 @@ static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) return container_of(fse, struct fanotify_event_info, fse); } -struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, +struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, + struct inode *inode, u32 mask, const struct path *path); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index fa803a58a605..ec4d8c59d0e3 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -757,7 +757,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); - oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL); + oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL); if (unlikely(!oevent)) { fd = -ENOMEM; goto out_destroy_group; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 8b73332735ba..40dedb37a1f3 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -99,8 +99,14 @@ int inotify_handle_event(struct fsnotify_group *group, fsn_mark); event = kmalloc(alloc_len, GFP_KERNEL); - if (unlikely(!event)) + if (unlikely(!event)) { + /* + * Treat lost event due to ENOMEM the same way as queue + * overflow to let userspace know event was lost. 
+ */ + fsnotify_queue_overflow(group); return -ENOMEM; + } fsn_event = &event->fse; fsnotify_init_event(fsn_event, inode, mask); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 43c23653ce2e..ef32f3657958 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -307,6 +307,20 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; +#ifdef CONFIG_CHECKPOINT_RESTORE + case INOTIFY_IOC_SETNEXTWD: + ret = -EINVAL; + if (arg >= 1 && arg <= INT_MAX) { + struct inotify_group_private_data *data; + + data = &group->inotify_data; + spin_lock(&data->idr_lock); + idr_set_cursor(&data->idr, (unsigned int)arg); + spin_unlock(&data->idr_lock); + ret = 0; + } + break; +#endif /* CONFIG_CHECKPOINT_RESTORE */ } return ret; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 66f85c651c52..3c3e36745f59 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -111,7 +111,8 @@ int fsnotify_add_event(struct fsnotify_group *group, return 2; } - if (group->q_len >= group->max_events) { + if (event == group->overflow_event || + group->q_len >= group->max_events) { ret = 2; /* Queue overflow event only if it isn't already queued */ if (!list_empty(&group->overflow_event->list)) { diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 2831f495a674..32c523cf5a2d 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -381,7 +381,7 @@ unm_err_out: * vfs inode dirty. This ensures that any changes to the mft record are * written out to disk. * - * NOTE: We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) + * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) * on the base vfs inode, because even though file data may have been modified, * it is dirty in the inode meta data rather than the data page cache of the * inode, and thus there are no data pages that need writing out. 
Therefore, a @@ -407,7 +407,7 @@ void __mark_mft_record_dirty(ntfs_inode *ni) else base_ni = ni->ext.base_ntfs_ino; mutex_unlock(&ni->extent_lock); - __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC); + __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC); } static const char *ntfs_please_email = "Please email " diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9a876bb07cac..0f157bbd3e0f 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7119,7 +7119,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, goto out_commit; did_quota = 1; - data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + data_ac->ac_resv = &oi->ip_la_data_resv; ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &num); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index e8e205bf2e41..302cd7caa4a7 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -346,7 +346,7 @@ static int ocfs2_readpage(struct file *file, struct page *page) unlock = 0; out_alloc: - up_read(&OCFS2_I(inode)->ip_alloc_sem); + up_read(&oi->ip_alloc_sem); out_inode_unlock: ocfs2_inode_unlock(inode, 0); out: @@ -2213,7 +2213,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, down_write(&oi->ip_alloc_sem); if (first_get_block) { - if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + if (ocfs2_sparse_alloc(osb)) ret = ocfs2_zero_tail(inode, di_bh, pos); else ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 8614ff069d99..3494a62ed749 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -78,7 +78,7 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) /* * Using a named enum representing lock types in terms of #N bit stored in * iocb->private, which is going to be used for communication between - * ocfs2_dio_end_io() and ocfs2_file_aio_write/read(). + * ocfs2_dio_end_io() and ocfs2_file_write/read_iter(). 
*/ enum ocfs2_iocb_lock_bits { OCFS2_IOCB_RW_LOCK = 0, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ea8c551bcd7e..91a8889abf9b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -570,7 +570,16 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, current_page, vec_len, vec_start); len = bio_add_page(bio, page, vec_len, vec_start); - if (len != vec_len) break; + if (len != vec_len) { + mlog(ML_ERROR, "Adding page[%d] to bio failed, " + "page %p, len %d, vec_len %u, vec_start %u, " + "bi_sector %llu\n", current_page, page, len, + vec_len, vec_start, + (unsigned long long)bio->bi_iter.bi_sector); + bio_put(bio); + bio = ERR_PTR(-EIO); + return bio; + } cs += vec_len / (PAGE_SIZE/spp); vec_start = 0; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 977763d4c27d..b048d4fa3959 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3072,7 +3072,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * We need to return the correct block within the * cluster which should hold our entry. 
*/ - off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), + off = ocfs2_dx_dir_hash_idx(osb, &lookup->dl_hinfo); get_bh(dx_leaves[off]); lookup->dl_dx_leaf_bh = dx_leaves[off]; diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index fd6bbbbd7d78..39831fc2fd52 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -224,14 +224,12 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lock *lock) { dlm_astlockfunc_t *fn; - struct dlm_lockstatus *lksb; mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name, res->lockname.len, res->lockname.name, dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); - lksb = lock->lksb; fn = lock->ast; BUG_ON(lock->ml.node != dlm->node_num); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e9f3705c4c9f..d06e27ec4be4 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -140,6 +140,7 @@ struct dlm_ctxt u8 node_num; u32 key; u8 joining_node; + u8 migrate_done; /* set to 1 means node has migrated all lock resources */ wait_queue_head_t dlm_join_events; unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -960,13 +961,10 @@ static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm, void dlm_print_one_lock_resource(struct dlm_lock_resource *res); void __dlm_print_one_lock_resource(struct dlm_lock_resource *res); -u8 dlm_nm_this_node(struct dlm_ctxt *dlm); void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); -int dlm_nm_init(struct dlm_ctxt *dlm); -int dlm_heartbeat_init(struct dlm_ctxt *dlm); void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 
e1fea149f50b..425081be6161 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -461,6 +461,19 @@ redo_bucket: cond_resched_lock(&dlm->spinlock); num += n; } + + if (!num) { + if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(0, "%s: perhaps there are more lock resources " + "need to be migrated after dlm recovery\n", dlm->name); + ret = -EAGAIN; + } else { + mlog(0, "%s: we won't do dlm recovery after migrating " + "all lock resources\n", dlm->name); + dlm->migrate_done = 1; + } + } + spin_unlock(&dlm->spinlock); wake_up(&dlm->dlm_thread_wq); @@ -675,20 +688,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm) spin_unlock(&dlm->spinlock); } -int dlm_shutting_down(struct dlm_ctxt *dlm) -{ - int ret = 0; - - spin_lock(&dlm_domain_lock); - - if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) - ret = 1; - - spin_unlock(&dlm_domain_lock); - - return ret; -} - void dlm_unregister_domain(struct dlm_ctxt *dlm) { int leave = 0; @@ -2052,6 +2051,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; init_waitqueue_head(&dlm->dlm_join_events); + dlm->migrate_done = 0; + dlm->reco.new_master = O2NM_INVALID_NODE_NUM; dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h index fd6122a38dbd..8a9281411c18 100644 --- a/fs/ocfs2/dlm/dlmdomain.h +++ b/fs/ocfs2/dlm/dlmdomain.h @@ -28,7 +28,30 @@ extern spinlock_t dlm_domain_lock; extern struct list_head dlm_domains; -int dlm_shutting_down(struct dlm_ctxt *dlm); +static inline int dlm_joined(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + if (dlm->dlm_state == DLM_CTXT_JOINED) + ret = 1; + spin_unlock(&dlm_domain_lock); + + return ret; +} + +static inline int dlm_shutting_down(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) + ret = 1; + spin_unlock(&dlm_domain_lock); + + return ret; +} + void 
dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, int node_num); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 66c2a491f68d..74962315794e 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -77,8 +77,7 @@ int dlm_init_lock_cache(void) void dlm_destroy_lock_cache(void) { - if (dlm_lock_cache) - kmem_cache_destroy(dlm_lock_cache); + kmem_cache_destroy(dlm_lock_cache); } /* Tell us whether we can grant a new lock request. diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index a7df226f9449..aaca0949fe53 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -414,8 +414,7 @@ int dlm_init_mle_cache(void) void dlm_destroy_mle_cache(void) { - if (dlm_mle_cache) - kmem_cache_destroy(dlm_mle_cache); + kmem_cache_destroy(dlm_mle_cache); } static void dlm_mle_release(struct kref *kref) @@ -472,15 +471,11 @@ bail: void dlm_destroy_master_caches(void) { - if (dlm_lockname_cache) { - kmem_cache_destroy(dlm_lockname_cache); - dlm_lockname_cache = NULL; - } + kmem_cache_destroy(dlm_lockname_cache); + dlm_lockname_cache = NULL; - if (dlm_lockres_cache) { - kmem_cache_destroy(dlm_lockres_cache); - dlm_lockres_cache = NULL; - } + kmem_cache_destroy(dlm_lockres_cache); + dlm_lockres_cache = NULL; } static void dlm_lockres_release(struct kref *kref) @@ -2495,13 +2490,13 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) } /* - * A migrateable resource is one that is : + * A migratable resource is one that is : * 1. locally mastered, and, * 2. zero local locks, and, * 3. one or more non-local locks, or, one or more references * Returns 1 if yes, 0 if not. 
*/ -static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, +static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { enum dlm_lockres_list idx; @@ -2532,7 +2527,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, continue; } cookie = be64_to_cpu(lock->ml.cookie); - mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " + mlog(0, "%s: Not migratable res %.*s, lock %u:%llu on " "%s list\n", dlm->name, res->lockname.len, res->lockname.name, dlm_get_lock_cookie_node(cookie), @@ -2548,7 +2543,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, return 0; } - mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, + mlog(0, "%s: res %.*s, Migratable\n", dlm->name, res->lockname.len, res->lockname.name); return 1; @@ -2792,7 +2787,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) assert_spin_locked(&dlm->spinlock); spin_lock(&res->spinlock); - if (dlm_is_lockres_migrateable(dlm, res)) + if (dlm_is_lockres_migratable(dlm, res)) target = dlm_pick_migration_target(dlm, res); spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index ec8f75813beb..802636d50365 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -62,7 +62,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node); static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, u8 dead_node); -static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm); static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res); static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, @@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm) static void dlm_begin_recovery(struct dlm_ctxt *dlm) { - spin_lock(&dlm->spinlock); + 
assert_spin_locked(&dlm->spinlock); BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n", dlm->name, dlm->reco.dead_node); dlm->reco.state |= DLM_RECO_STATE_ACTIVE; - spin_unlock(&dlm->spinlock); } static void dlm_end_recovery(struct dlm_ctxt *dlm) @@ -456,6 +455,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) spin_lock(&dlm->spinlock); + if (dlm->migrate_done) { + mlog(0, "%s: no need do recovery after migrating all " + "lock resources\n", dlm->name); + spin_unlock(&dlm->spinlock); + return 0; + } + /* check to see if the new master has died */ if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM && test_bit(dlm->reco.new_master, dlm->recovery_map)) { @@ -490,12 +496,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.dead_node); - spin_unlock(&dlm->spinlock); /* take write barrier */ /* (stops the list reshuffling thread, proxy ast handling) */ dlm_begin_recovery(dlm); + spin_unlock(&dlm->spinlock); + if (dlm->reco.new_master == dlm->node_num) goto master_here; @@ -739,7 +746,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) } if (destroy) - dlm_destroy_recovery_area(dlm, dead_node); + dlm_destroy_recovery_area(dlm); return status; } @@ -764,7 +771,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) ndata = kzalloc(sizeof(*ndata), GFP_NOFS); if (!ndata) { - dlm_destroy_recovery_area(dlm, dead_node); + dlm_destroy_recovery_area(dlm); return -ENOMEM; } ndata->node_num = num; @@ -778,7 +785,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) return 0; } -static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm) { struct dlm_reco_node_data *ndata, *next; LIST_HEAD(tmplist); @@ -1378,6 +1385,15 @@ int 
dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, if (!dlm_grab(dlm)) return -EINVAL; + if (!dlm_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not joined! " + "lockres %.*s, master %u\n", + dlm->name, mres->lockname_len, + mres->lockname, mres->master); + dlm_put(dlm); + return -EINVAL; + } + BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); real_master = mres->master; @@ -1807,7 +1823,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, int i, j, bad; struct dlm_lock *lock; u8 from = O2NM_MAX_NODES; - unsigned int added = 0; __be64 c; mlog(0, "running %d locks for this lockres\n", mres->num_locks); @@ -1823,7 +1838,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); dlm_lockres_set_refmap_bit(dlm, res, from); spin_unlock(&res->spinlock); - added++; break; } BUG_ON(ml->highest_blocked != LKM_IVMODE); @@ -1911,7 +1925,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* do not alter lock refcount. switching lists. */ list_move_tail(&lock->list, queue); spin_unlock(&res->spinlock); - added++; mlog(0, "just reordered a local lock!\n"); continue; @@ -2037,7 +2050,6 @@ skip_lvb: "setting refmap bit\n", dlm->name, res->lockname.len, res->lockname.name, ml->node); dlm_lockres_set_refmap_bit(dlm, res, ml->node); - added++; } spin_unlock(&res->spinlock); } @@ -2331,13 +2343,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, __dlm_dirty_lockres(dlm, res); } -/* if this node is the recovery master, and there are no - * locks for a given lockres owned by this node that are in - * either PR or EX mode, zero out the lvb before requesting. 
- * - */ - - static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) { struct dlm_lock_resource *res; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 9479f99c2145..97a972efab83 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1756,8 +1756,7 @@ int ocfs2_rw_lock(struct inode *inode, int write) level = write ? DLM_LOCK_EX : DLM_LOCK_PR; - status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, - 0); + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); if (status < 0) mlog_errno(status); @@ -1796,7 +1795,7 @@ void ocfs2_rw_unlock(struct inode *inode, int write) write ? "EXMODE" : "PRMODE"); if (!ocfs2_mount_local(osb)) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + ocfs2_cluster_unlock(osb, lockres, level); } /* @@ -1816,8 +1815,7 @@ int ocfs2_open_lock(struct inode *inode) lockres = &OCFS2_I(inode)->ip_open_lockres; - status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, - DLM_LOCK_PR, 0, 0); + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_PR, 0, 0); if (status < 0) mlog_errno(status); @@ -1854,8 +1852,7 @@ int ocfs2_try_open_lock(struct inode *inode, int write) * other nodes and the -EAGAIN will indicate to the caller that * this inode is still in use. */ - status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, - level, DLM_LKF_NOQUEUE, 0); + status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0); out: return status; @@ -1876,11 +1873,9 @@ void ocfs2_open_unlock(struct inode *inode) goto out; if(lockres->l_ro_holders) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, - DLM_LOCK_PR); + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_PR); if(lockres->l_ex_holders) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, - DLM_LOCK_EX); + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); out: return; @@ -2601,9 +2596,9 @@ void ocfs2_inode_unlock(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, ex ? 
"EXMODE" : "PRMODE"); - if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + ocfs2_cluster_unlock(osb, lockres, level); } /* @@ -3537,7 +3532,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that * we can recover correctly from node failure. Otherwise, we may get - * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set. + * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set. */ if (!ocfs2_is_o2cb_active() && lockres->l_ops->flags & LOCK_TYPE_USES_LVB) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5d1784a365a3..6ee94bc23f5b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -101,7 +101,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) struct ocfs2_inode_info *oi = OCFS2_I(inode); trace_ocfs2_file_open(inode, file, file->f_path.dentry, - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)oi->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, mode); @@ -116,7 +116,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) /* Check that the inode hasn't been wiped from disk by another * node. If it hasn't then we're safe as long as we hold the * spin lock until our increment of open count. 
*/ - if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + if (oi->ip_flags & OCFS2_INODE_DELETED) { spin_unlock(&oi->ip_lock); status = -ENOENT; @@ -190,7 +190,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, bool needs_barrier = false; trace_ocfs2_sync_file(inode, file, file->f_path.dentry, - OCFS2_I(inode)->ip_blkno, + oi->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, (unsigned long long)datasync); @@ -296,7 +296,7 @@ int ocfs2_update_inode_atime(struct inode *inode, ocfs2_journal_dirty(handle, bh); out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + ocfs2_commit_trans(osb, handle); out: return ret; } @@ -2257,7 +2257,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; - trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, + trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, @@ -2405,7 +2405,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; - trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, + trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, filp->f_path.dentry->d_name.len, filp->f_path.dentry->d_name.name, @@ -2448,7 +2448,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, * * Take and drop the meta data lock to update inode fields * like i_size. This allows the checks down below - * generic_file_aio_read() a chance of actually working. + * generic_file_read_iter() a chance of actually working. 
*/ ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level, !nowait); @@ -2460,7 +2460,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, ocfs2_inode_unlock(inode, lock_level); ret = generic_file_read_iter(iocb, to); - trace_generic_file_aio_read_ret(ret); + trace_generic_file_read_iter_ret(ret); /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 6b92cb241138..f65f2b2f594d 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -53,36 +53,6 @@ static const char * const ocfs2_filecheck_errs[] = { "UNSUPPORTED" }; -static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); -static LIST_HEAD(ocfs2_filecheck_sysfs_list); - -struct ocfs2_filecheck { - struct list_head fc_head; /* File check entry list head */ - spinlock_t fc_lock; - unsigned int fc_max; /* Maximum number of entry in list */ - unsigned int fc_size; /* Current entry count in list */ - unsigned int fc_done; /* Finished entry count in list */ -}; - -struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */ - struct list_head fs_list; - atomic_t fs_count; - struct super_block *fs_sb; - struct kset *fs_devicekset; - struct kset *fs_fcheckkset; - struct ocfs2_filecheck *fs_fcheck; -}; - -#define OCFS2_FILECHECK_MAXSIZE 100 -#define OCFS2_FILECHECK_MINSIZE 10 - -/* File check operation type */ -enum { - OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ - OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ - OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ -}; - struct ocfs2_filecheck_entry { struct list_head fe_list; unsigned long fe_ino; @@ -110,35 +80,84 @@ ocfs2_filecheck_error(int errno) return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; } -static ssize_t ocfs2_filecheck_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf); -static ssize_t ocfs2_filecheck_store(struct kobject *kobj, - 
struct kobj_attribute *attr, - const char *buf, size_t count); -static struct kobj_attribute ocfs2_attr_filecheck_chk = +static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +static struct kobj_attribute ocfs2_filecheck_attr_chk = __ATTR(check, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); -static struct kobj_attribute ocfs2_attr_filecheck_fix = + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_fix = __ATTR(fix, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); -static struct kobj_attribute ocfs2_attr_filecheck_set = + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct kobj_attribute ocfs2_filecheck_attr_set = __ATTR(set, S_IRUSR | S_IWUSR, - ocfs2_filecheck_show, - ocfs2_filecheck_store); + ocfs2_filecheck_attr_show, + ocfs2_filecheck_attr_store); +static struct attribute *ocfs2_filecheck_attrs[] = { + &ocfs2_filecheck_attr_chk.attr, + &ocfs2_filecheck_attr_fix.attr, + &ocfs2_filecheck_attr_set.attr, + NULL +}; + +static void ocfs2_filecheck_release(struct kobject *kobj) +{ + struct ocfs2_filecheck_sysfs_entry *entry = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); + + complete(&entry->fs_kobj_unregister); +} + +static ssize_t +ocfs2_filecheck_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->show) + ret = kattr->show(kobj, kattr, buf); + kobject_put(kobj); + return ret; +} + +static ssize_t +ocfs2_filecheck_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret = -EIO; + struct kobj_attribute *kattr = container_of(attr, + struct 
kobj_attribute, attr); + + kobject_get(kobj); + if (kattr->store) + ret = kattr->store(kobj, kattr, buf, count); + kobject_put(kobj); + return ret; +} + +static const struct sysfs_ops ocfs2_filecheck_ops = { + .show = ocfs2_filecheck_show, + .store = ocfs2_filecheck_store, +}; + +static struct kobj_type ocfs2_ktype_filecheck = { + .default_attrs = ocfs2_filecheck_attrs, + .sysfs_ops = &ocfs2_filecheck_ops, + .release = ocfs2_filecheck_release, +}; static void ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) { struct ocfs2_filecheck_entry *p; - if (!atomic_dec_and_test(&entry->fs_count)) { - wait_var_event(&entry->fs_count, - !atomic_read(&entry->fs_count)); - } - spin_lock(&entry->fs_fcheck->fc_lock); while (!list_empty(&entry->fs_fcheck->fc_head)) { p = list_first_entry(&entry->fs_fcheck->fc_head, @@ -149,151 +168,48 @@ ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) } spin_unlock(&entry->fs_fcheck->fc_lock); - kset_unregister(entry->fs_fcheckkset); - kset_unregister(entry->fs_devicekset); kfree(entry->fs_fcheck); - kfree(entry); -} - -static void -ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry) -{ - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list); - spin_unlock(&ocfs2_filecheck_sysfs_lock); + entry->fs_fcheck = NULL; } -static int ocfs2_filecheck_sysfs_del(const char *devname) +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb) { - struct ocfs2_filecheck_sysfs_entry *p; - - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { - if (!strcmp(p->fs_sb->s_id, devname)) { - list_del(&p->fs_list); - spin_unlock(&ocfs2_filecheck_sysfs_lock); - ocfs2_filecheck_sysfs_free(p); - return 0; - } - } - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return 1; -} - -static void -ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry) -{ - if (atomic_dec_and_test(&entry->fs_count)) - 
wake_up_var(&entry->fs_count); -} - -static struct ocfs2_filecheck_sysfs_entry * -ocfs2_filecheck_sysfs_get(const char *devname) -{ - struct ocfs2_filecheck_sysfs_entry *p = NULL; - - spin_lock(&ocfs2_filecheck_sysfs_lock); - list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { - if (!strcmp(p->fs_sb->s_id, devname)) { - atomic_inc(&p->fs_count); - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return p; - } - } - spin_unlock(&ocfs2_filecheck_sysfs_lock); - return NULL; -} - -int ocfs2_filecheck_create_sysfs(struct super_block *sb) -{ - int ret = 0; - struct kset *device_kset = NULL; - struct kset *fcheck_kset = NULL; - struct ocfs2_filecheck *fcheck = NULL; - struct ocfs2_filecheck_sysfs_entry *entry = NULL; - struct attribute **attrs = NULL; - struct attribute_group attrgp; - - if (!ocfs2_kset) - return -ENOMEM; - - attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS); - if (!attrs) { - ret = -ENOMEM; - goto error; - } else { - attrs[0] = &ocfs2_attr_filecheck_chk.attr; - attrs[1] = &ocfs2_attr_filecheck_fix.attr; - attrs[2] = &ocfs2_attr_filecheck_set.attr; - attrs[3] = NULL; - memset(&attrgp, 0, sizeof(attrgp)); - attrgp.attrs = attrs; - } + int ret; + struct ocfs2_filecheck *fcheck; + struct ocfs2_filecheck_sysfs_entry *entry = &osb->osb_fc_ent; fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS); - if (!fcheck) { - ret = -ENOMEM; - goto error; - } else { - INIT_LIST_HEAD(&fcheck->fc_head); - spin_lock_init(&fcheck->fc_lock); - fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; - fcheck->fc_size = 0; - fcheck->fc_done = 0; - } - - if (strlen(sb->s_id) <= 0) { - mlog(ML_ERROR, - "Cannot get device basename when create filecheck sysfs\n"); - ret = -ENODEV; - goto error; - } - - device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); - if (!device_kset) { - ret = -ENOMEM; - goto error; - } - - fcheck_kset = kset_create_and_add("filecheck", NULL, - &device_kset->kobj); - if (!fcheck_kset) { - ret = -ENOMEM; - goto error; - } - - ret = 
sysfs_create_group(&fcheck_kset->kobj, &attrgp); - if (ret) - goto error; + if (!fcheck) + return -ENOMEM; - entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS); - if (!entry) { - ret = -ENOMEM; - goto error; - } else { - atomic_set(&entry->fs_count, 1); - entry->fs_sb = sb; - entry->fs_devicekset = device_kset; - entry->fs_fcheckkset = fcheck_kset; - entry->fs_fcheck = fcheck; - ocfs2_filecheck_sysfs_add(entry); + INIT_LIST_HEAD(&fcheck->fc_head); + spin_lock_init(&fcheck->fc_lock); + fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; + fcheck->fc_size = 0; + fcheck->fc_done = 0; + + entry->fs_kobj.kset = osb->osb_dev_kset; + init_completion(&entry->fs_kobj_unregister); + ret = kobject_init_and_add(&entry->fs_kobj, &ocfs2_ktype_filecheck, + NULL, "filecheck"); + if (ret) { + kfree(fcheck); + return ret; } - kfree(attrs); + entry->fs_fcheck = fcheck; return 0; - -error: - kfree(attrs); - kfree(entry); - kfree(fcheck); - kset_unregister(fcheck_kset); - kset_unregister(device_kset); - return ret; } -int ocfs2_filecheck_remove_sysfs(struct super_block *sb) +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb) { - return ocfs2_filecheck_sysfs_del(sb->s_id); + if (!osb->osb_fc_ent.fs_fcheck) + return; + + kobject_del(&osb->osb_fc_ent.fs_kobj); + kobject_put(&osb->osb_fc_ent.fs_kobj); + wait_for_completion(&osb->osb_fc_ent.fs_kobj_unregister); + ocfs2_filecheck_sysfs_free(&osb->osb_fc_ent); } static int @@ -310,7 +226,7 @@ ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent, spin_lock(&ent->fs_fcheck->fc_lock); if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) { - mlog(ML_ERROR, + mlog(ML_NOTICE, "Cannot set online file check maximum entry number " "to %u due to too many pending entries(%u)\n", len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done); @@ -387,7 +303,7 @@ ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count, return 0; } -static ssize_t ocfs2_filecheck_show(struct kobject *kobj, +static 
ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -395,19 +311,12 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj, ssize_t ret = 0, total = 0, remain = PAGE_SIZE; unsigned int type; struct ocfs2_filecheck_entry *p; - struct ocfs2_filecheck_sysfs_entry *ent; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); if (ocfs2_filecheck_type_parse(attr->attr.name, &type)) return -EINVAL; - ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); - if (!ent) { - mlog(ML_ERROR, - "Cannot get the corresponding entry via device basename %s\n", - kobj->name); - return -ENODEV; - } - if (type == OCFS2_FILECHECK_TYPE_SET) { spin_lock(&ent->fs_fcheck->fc_lock); total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max); @@ -441,11 +350,26 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj, spin_unlock(&ent->fs_fcheck->fc_lock); exit: - ocfs2_filecheck_sysfs_put(ent); return total; } -static int +static inline int +ocfs2_filecheck_is_dup_entry(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned long ino) +{ + struct ocfs2_filecheck_entry *p; + + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (!p->fe_done) { + if (p->fe_ino == ino) + return 1; + } + } + + return 0; +} + +static inline int ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) { struct ocfs2_filecheck_entry *p; @@ -484,21 +408,21 @@ static void ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent, struct ocfs2_filecheck_entry *entry) { - entry->fe_done = 1; spin_lock(&ent->fs_fcheck->fc_lock); + entry->fe_done = 1; ent->fs_fcheck->fc_done++; spin_unlock(&ent->fs_fcheck->fc_lock); } static unsigned int -ocfs2_filecheck_handle(struct super_block *sb, +ocfs2_filecheck_handle(struct ocfs2_super *osb, unsigned long ino, unsigned int flags) { unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS; struct inode *inode = NULL; int rc; - inode = 
ocfs2_iget(OCFS2_SB(sb), ino, flags, 0); + inode = ocfs2_iget(osb, ino, flags, 0); if (IS_ERR(inode)) { rc = (int)(-(long)inode); if (rc >= OCFS2_FILECHECK_ERR_START && @@ -516,11 +440,14 @@ static void ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, struct ocfs2_filecheck_entry *entry) { + struct ocfs2_super *osb = container_of(ent, struct ocfs2_super, + osb_fc_ent); + if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK) - entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_status = ocfs2_filecheck_handle(osb, entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK); else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX) - entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_status = ocfs2_filecheck_handle(osb, entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX); else entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED; @@ -528,30 +455,21 @@ ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, ocfs2_filecheck_done_entry(ent, entry); } -static ssize_t ocfs2_filecheck_store(struct kobject *kobj, +static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + ssize_t ret = 0; struct ocfs2_filecheck_args args; struct ocfs2_filecheck_entry *entry; - struct ocfs2_filecheck_sysfs_entry *ent; - ssize_t ret = 0; + struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj, + struct ocfs2_filecheck_sysfs_entry, fs_kobj); if (count == 0) return count; - if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) { - mlog(ML_ERROR, "Invalid arguments for online file check\n"); + if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) return -EINVAL; - } - - ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); - if (!ent) { - mlog(ML_ERROR, - "Cannot get the corresponding entry via device basename %s\n", - kobj->parent->name); - return -ENODEV; - } if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) { ret = ocfs2_filecheck_adjust_max(ent, args.fa_len); @@ 
-565,13 +483,16 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj, } spin_lock(&ent->fs_fcheck->fc_lock); - if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && - (ent->fs_fcheck->fc_done == 0)) { - mlog(ML_ERROR, + if (ocfs2_filecheck_is_dup_entry(ent, args.fa_ino)) { + ret = -EEXIST; + kfree(entry); + } else if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done == 0)) { + mlog(ML_NOTICE, "Cannot do more file check " "since file check queue(%u) is full now\n", ent->fs_fcheck->fc_max); - ret = -EBUSY; + ret = -EAGAIN; kfree(entry); } else { if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && @@ -596,6 +517,5 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj, ocfs2_filecheck_handle_entry(ent, entry); exit: - ocfs2_filecheck_sysfs_put(ent); return (!ret ? count : ret); } diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h index e5cd002a2c09..6a22ee79e8d0 100644 --- a/fs/ocfs2/filecheck.h +++ b/fs/ocfs2/filecheck.h @@ -43,7 +43,32 @@ enum { #define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED #define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED -int ocfs2_filecheck_create_sysfs(struct super_block *sb); -int ocfs2_filecheck_remove_sysfs(struct super_block *sb); +struct ocfs2_filecheck { + struct list_head fc_head; /* File check entry list head */ + spinlock_t fc_lock; + unsigned int fc_max; /* Maximum number of entry in list */ + unsigned int fc_size; /* Current entry count in list */ + unsigned int fc_done; /* Finished entry count in list */ +}; + +#define OCFS2_FILECHECK_MAXSIZE 100 +#define OCFS2_FILECHECK_MINSIZE 10 + +/* File check operation type */ +enum { + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ + OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ +}; + +struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per partition */ + struct kobject fs_kobj; + struct completion fs_kobj_unregister; + 
struct ocfs2_filecheck *fs_fcheck; +}; + + +int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb); +void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb); #endif /* FILECHECK_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index d51b80edd972..ddc3e9470c87 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1135,7 +1135,7 @@ static void ocfs2_clear_inode(struct inode *inode) trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, inode->i_nlink); - mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, + mlog_bug_on_msg(osb == NULL, "Inode=%lu\n", inode->i_ino); dquot_drop(inode); @@ -1150,7 +1150,7 @@ static void ocfs2_clear_inode(struct inode *inode) ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres); - ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, + ocfs2_resv_discard(&osb->osb_la_resmap, &oi->ip_la_data_resv); ocfs2_resv_init_once(&oi->ip_la_data_resv); @@ -1160,7 +1160,7 @@ static void ocfs2_clear_inode(struct inode *inode) * exception here are successfully wiped inodes - their * metadata can now be considered to be part of the system * inodes from which it came. */ - if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) + if (!(oi->ip_flags & OCFS2_INODE_DELETED)) ocfs2_checkpoint_inode(inode); mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), @@ -1223,7 +1223,7 @@ static void ocfs2_clear_inode(struct inode *inode) * the journal is flushed before journal shutdown. Thus it is safe to * have inodes get cleaned up after journal shutdown. */ - jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, + jbd2_journal_release_jbd_inode(osb->journal->j_journal, &oi->ip_jinode); } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index c801eddc4bf3..8dd6f703c819 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -525,7 +525,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, * these are used by the support functions here and in * callers. 
*/ inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); - OCFS2_I(inode)->ip_blkno = fe_blkno; + oi->ip_blkno = fe_blkno; spin_lock(&osb->osb_lock); inode->i_generation = osb->s_next_generation++; spin_unlock(&osb->osb_lock); @@ -1186,8 +1186,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, } trace_ocfs2_double_lock_end( - (unsigned long long)OCFS2_I(inode1)->ip_blkno, - (unsigned long long)OCFS2_I(inode2)->ip_blkno); + (unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); bail: if (status) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 6867eef2e06b..4f86ac0027b5 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -50,6 +50,8 @@ #include "reservations.h" +#include "filecheck.h" + /* Caching of metadata buffers */ /* Most user visible OCFS2 inodes will have very few pieces of @@ -472,6 +474,12 @@ struct ocfs2_super * workqueue and schedule on our own. */ struct workqueue_struct *ocfs2_wq; + + /* sysfs directory per partition */ + struct kset *osb_dev_kset; + + /* file check related stuff */ + struct ocfs2_filecheck_sysfs_entry osb_fc_ent; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index e2a11aaece10..2ee76a90ba8f 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1311,11 +1311,11 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_file_release); DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file); -DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write); +DEFINE_OCFS2_FILE_OPS(ocfs2_file_write_iter); DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write); -DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read); +DEFINE_OCFS2_FILE_OPS(ocfs2_file_read_iter); DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file); @@ -1467,7 +1467,7 @@ TRACE_EVENT(ocfs2_prepare_inode_for_write, __entry->saved_pos, __entry->count, __entry->wait) ); -DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); +DEFINE_OCFS2_INT_EVENT(generic_file_read_iter_ret); /* End of trace events for fs/ocfs2/file.c. 
*/ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ab156e35ec00..01c6b3894406 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -573,7 +573,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, BUG_ON(ocfs2_is_refcount_inode(inode)); trace_ocfs2_create_refcount_tree( - (unsigned long long)OCFS2_I(inode)->ip_blkno); + (unsigned long long)oi->ip_blkno); ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); if (ret) { @@ -3359,7 +3359,7 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) unsigned int ext_flags; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + if (!ocfs2_refcount_tree(osb)) { return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); } @@ -3707,7 +3707,7 @@ int ocfs2_add_refcount_flag(struct inode *inode, trace_ocfs2_add_refcount_flag(ref_blocks, credits); if (ref_blocks) { - ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + ret = ocfs2_reserve_new_metadata_blocks(osb, ref_blocks, &meta_ac); if (ret) { mlog_errno(ret); @@ -4766,8 +4766,8 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode, *bh2 = *bh1; trace_ocfs2_double_lock_end( - (unsigned long long)OCFS2_I(inode1)->ip_blkno, - (unsigned long long)OCFS2_I(inode2)->ip_blkno); + (unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); return 0; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index dae9eb7c441e..d2fb97b173da 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -398,7 +398,7 @@ static int ocfs2_control_do_setnode_msg(struct file *file, static int ocfs2_control_do_setversion_msg(struct file *file, struct ocfs2_control_message_setv *msg) - { +{ long major, minor; char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index d8f5f6ce99dc..f7c972fbed6a 
100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -79,8 +79,6 @@ static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); } -static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); -static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); static int ocfs2_block_group_fill(handle_t *handle, struct inode *alloc_inode, @@ -387,7 +385,7 @@ static int ocfs2_block_group_fill(handle_t *handle, memset(bg, 0, sb->s_blocksize); strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); - bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + bg->bg_generation = cpu_to_le32(osb->fs_generation); bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1, osb->s_feature_incompat)); bg->bg_chain = cpu_to_le16(my_chain); @@ -1521,7 +1519,7 @@ static int ocfs2_cluster_group_search(struct inode *inode, OCFS2_I(inode)->ip_clusters, max_bits); } - ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + ret = ocfs2_block_group_find_clear_bits(osb, group_bh, bits_wanted, max_bits, res); if (ret) @@ -2626,53 +2624,6 @@ int ocfs2_release_clusters(handle_t *handle, _ocfs2_clear_bit); } -static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) -{ - printk("Block Group:\n"); - printk("bg_signature: %s\n", bg->bg_signature); - printk("bg_size: %u\n", bg->bg_size); - printk("bg_bits: %u\n", bg->bg_bits); - printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); - printk("bg_chain: %u\n", bg->bg_chain); - printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); - printk("bg_next_group: %llu\n", - (unsigned long long)bg->bg_next_group); - printk("bg_parent_dinode: %llu\n", - (unsigned long long)bg->bg_parent_dinode); - printk("bg_blkno: %llu\n", - (unsigned long long)bg->bg_blkno); -} - -static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) -{ - int i; - - 
printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno); - printk("i_signature: %s\n", fe->i_signature); - printk("i_size: %llu\n", - (unsigned long long)fe->i_size); - printk("i_clusters: %u\n", fe->i_clusters); - printk("i_generation: %u\n", - le32_to_cpu(fe->i_generation)); - printk("id1.bitmap1.i_used: %u\n", - le32_to_cpu(fe->id1.bitmap1.i_used)); - printk("id1.bitmap1.i_total: %u\n", - le32_to_cpu(fe->id1.bitmap1.i_total)); - printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); - printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); - printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); - printk("id2.i_chain.cl_next_free_rec: %u\n", - fe->id2.i_chain.cl_next_free_rec); - for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { - printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, - fe->id2.i_chain.cl_recs[i].c_free); - printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, - fe->id2.i_chain.cl_recs[i].c_total); - printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i, - (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); - } -} - /* * For a given allocation, determine which allocators will need to be * accessed, and lock them, reserving the appropriate number of bits. 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ffa4952d432b..3415e0b09398 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -423,10 +423,10 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait) ocfs2_schedule_truncate_log_flush(osb, 0); } - if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, + if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { if (wait) - jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + jbd2_log_wait_commit(osb->journal->j_journal, target); } return 0; @@ -1161,6 +1161,23 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) ocfs2_complete_mount_recovery(osb); + osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, + &ocfs2_kset->kobj); + if (!osb->osb_dev_kset) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id); + goto read_super_error; + } + + /* Create filecheck sysfs related directories/files at + * /sys/fs/ocfs2/<devname>/filecheck */ + if (ocfs2_filecheck_create_sysfs(osb)) { + status = -ENOMEM; + mlog(ML_ERROR, "Unable to create filecheck sysfs directory at " + "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id); + goto read_super_error; + } + if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else @@ -1199,9 +1216,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Start this when the mount is almost sure of being successful */ ocfs2_orphan_scan_start(osb); - /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */ - ocfs2_filecheck_create_sysfs(sb); - return status; read_super_error: @@ -1653,7 +1667,6 @@ static void ocfs2_put_super(struct super_block *sb) ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); - ocfs2_filecheck_remove_sysfs(sb); } static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -1768,12 +1781,9 @@ static int ocfs2_initialize_mem_caches(void) NULL); if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || !ocfs2_qf_chunk_cachep) { - if 
(ocfs2_inode_cachep) - kmem_cache_destroy(ocfs2_inode_cachep); - if (ocfs2_dquot_cachep) - kmem_cache_destroy(ocfs2_dquot_cachep); - if (ocfs2_qf_chunk_cachep) - kmem_cache_destroy(ocfs2_qf_chunk_cachep); + kmem_cache_destroy(ocfs2_inode_cachep); + kmem_cache_destroy(ocfs2_dquot_cachep); + kmem_cache_destroy(ocfs2_qf_chunk_cachep); return -ENOMEM; } @@ -1787,16 +1797,13 @@ static void ocfs2_free_mem_caches(void) * destroy cache. */ rcu_barrier(); - if (ocfs2_inode_cachep) - kmem_cache_destroy(ocfs2_inode_cachep); + kmem_cache_destroy(ocfs2_inode_cachep); ocfs2_inode_cachep = NULL; - if (ocfs2_dquot_cachep) - kmem_cache_destroy(ocfs2_dquot_cachep); + kmem_cache_destroy(ocfs2_dquot_cachep); ocfs2_dquot_cachep = NULL; - if (ocfs2_qf_chunk_cachep) - kmem_cache_destroy(ocfs2_qf_chunk_cachep); + kmem_cache_destroy(ocfs2_qf_chunk_cachep); ocfs2_qf_chunk_cachep = NULL; } @@ -1899,6 +1906,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); + /* Remove file check sysfs related directores/files, + * and wait for the pending file check operations */ + ocfs2_filecheck_remove_sysfs(osb); + + kset_unregister(osb->osb_dev_kset); + debugfs_remove(osb->osb_ctxt); /* Orphan scan should be stopped as early as possible */ diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 82e17b076ce7..78f09c76ab3c 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -633,6 +633,5 @@ int __init init_ocfs2_uptodate_cache(void) void exit_ocfs2_uptodate_cache(void) { - if (ocfs2_uptodate_cachep) - kmem_cache_destroy(ocfs2_uptodate_cachep); + kmem_cache_destroy(ocfs2_uptodate_cachep); } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index c261c1dfd374..3a24ce3deb01 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3564,7 +3564,7 @@ int ocfs2_xattr_set(struct inode *inode, .not_found = -ENODATA, }; - if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + if (!ocfs2_supports_xattr(osb)) return -EOPNOTSUPP; /* diff --git 
a/fs/open.c b/fs/open.c index d0e955b558ad..c5ee7cd60424 100644 --- a/fs/open.c +++ b/fs/open.c @@ -724,16 +724,6 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) return ksys_fchown(fd, user, group); } -int open_check_o_direct(struct file *f) -{ - /* NB: we're sure to have correct a_ops only after f_op->open */ - if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) - return -EINVAL; - } - return 0; -} - static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *), @@ -755,7 +745,7 @@ static int do_dentry_open(struct file *f, if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH; f->f_op = &empty_fops; - return 0; + goto done; } if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { @@ -808,7 +798,12 @@ static int do_dentry_open(struct file *f, f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); - +done: + /* NB: we're sure to have correct a_ops only after f_op->open */ + error = -EINVAL; + if ((f->f_flags & O_DIRECT) && + (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)) + goto out_fput; return 0; cleanup_all: @@ -823,6 +818,9 @@ cleanup_file: f->f_path.dentry = NULL; f->f_inode = NULL; return error; +out_fput: + fput(f); + return error; } /** @@ -920,20 +918,14 @@ struct file *dentry_open(const struct path *path, int flags, BUG_ON(!path->mnt); f = get_empty_filp(); - if (!IS_ERR(f)) { - f->f_flags = flags; - error = vfs_open(path, f, cred); - if (!error) { - /* from now on we need fput() to dispose of f */ - error = open_check_o_direct(f); - if (error) { - fput(f); - f = ERR_PTR(error); - } - } else { - put_filp(f); - f = ERR_PTR(error); - } + if (IS_ERR(f)) + return f; + + f->f_flags = flags; + error = vfs_open(path, f, cred); + if (error) { + put_filp(f); + return ERR_PTR(error); } return f; } diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 
480ea059a680..10587413b20e 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -9,7 +9,6 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" #include <linux/posix_acl_xattr.h> -#include <linux/fs_struct.h> struct posix_acl *orangefs_get_acl(struct inode *inode, int type) { diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index b03057afac2a..66369ec90020 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -463,11 +463,10 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, if (op->downcall.type != ORANGEFS_VFS_OP_READDIR) goto wakeup; - op->downcall.trailer_buf = vmalloc(op->downcall.trailer_size); + op->downcall.trailer_buf = vzalloc(op->downcall.trailer_size); if (!op->downcall.trailer_buf) goto Enomem; - memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size); if (!copy_from_iter_full(op->downcall.trailer_buf, op->downcall.trailer_size, iter)) { gossip_err("%s: failed to copy trailer.\n", __func__); @@ -779,9 +778,35 @@ static long orangefs_devreq_compat_ioctl(struct file *filp, unsigned int cmd, #endif /* CONFIG_COMPAT is in .config */ +static __poll_t orangefs_devreq_poll(struct file *file, + struct poll_table_struct *poll_table) +{ + __poll_t poll_revent_mask = 0; + + poll_wait(file, &orangefs_request_list_waitq, poll_table); + + if (!list_empty(&orangefs_request_list)) + poll_revent_mask |= EPOLLIN; + return poll_revent_mask; +} + /* the assigned character device major number */ static int orangefs_dev_major; +static const struct file_operations orangefs_devreq_file_operations = { + .owner = THIS_MODULE, + .read = orangefs_devreq_read, + .write_iter = orangefs_devreq_write_iter, + .open = orangefs_devreq_open, + .release = orangefs_devreq_release, + .unlocked_ioctl = orangefs_devreq_ioctl, + +#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */ + .compat_ioctl = orangefs_devreq_compat_ioctl, +#endif + .poll = orangefs_devreq_poll +}; + /* * Initialize orangefs device 
specific state: * Must be called at module load time only @@ -814,29 +839,3 @@ void orangefs_dev_cleanup(void) "*** /dev/%s character device unregistered ***\n", ORANGEFS_REQDEVICE_NAME); } - -static __poll_t orangefs_devreq_poll(struct file *file, - struct poll_table_struct *poll_table) -{ - __poll_t poll_revent_mask = 0; - - poll_wait(file, &orangefs_request_list_waitq, poll_table); - - if (!list_empty(&orangefs_request_list)) - poll_revent_mask |= EPOLLIN; - return poll_revent_mask; -} - -const struct file_operations orangefs_devreq_file_operations = { - .owner = THIS_MODULE, - .read = orangefs_devreq_read, - .write_iter = orangefs_devreq_write_iter, - .open = orangefs_devreq_open, - .release = orangefs_devreq_release, - .unlocked_ioctl = orangefs_devreq_ioctl, - -#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */ - .compat_ioctl = orangefs_devreq_compat_ioctl, -#endif - .poll = orangefs_devreq_poll -}; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 0d228cd087e6..26358efbf794 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -42,70 +42,6 @@ static int flush_racache(struct inode *inode) } /* - * Copy to client-core's address space from the buffers specified - * by the iovec upto total_size bytes. - * NOTE: the iovector can either contain addresses which - * can futher be kernel-space or user-space addresses. - * or it can pointers to struct page's - */ -static int precopy_buffers(int buffer_index, - struct iov_iter *iter, - size_t total_size) -{ - int ret = 0; - /* - * copy data from application/kernel by pulling it out - * of the iovec. - */ - - - if (total_size) { - ret = orangefs_bufmap_copy_from_iovec(iter, - buffer_index, - total_size); - if (ret < 0) - gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n", - __func__, - (long)ret); - } - - if (ret < 0) - gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. 
%ld\n", - __func__, - (long)ret); - return ret; -} - -/* - * Copy from client-core's address space to the buffers specified - * by the iovec upto total_size bytes. - * NOTE: the iovector can either contain addresses which - * can futher be kernel-space or user-space addresses. - * or it can pointers to struct page's - */ -static int postcopy_buffers(int buffer_index, - struct iov_iter *iter, - size_t total_size) -{ - int ret = 0; - /* - * copy data to application/kernel by pushing it out to - * the iovec. NOTE; target buffers can be addresses or - * struct page pointers. - */ - if (total_size) { - ret = orangefs_bufmap_copy_to_iovec(iter, - buffer_index, - total_size); - if (ret < 0) - gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n", - __func__, - (long)ret); - } - return ret; -} - -/* * Post and wait for the I/O upcall to finish */ static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, @@ -157,14 +93,15 @@ populate_shared_memory: total_size); /* * Stage 1: copy the buffers into client-core's address space - * precopy_buffers only pertains to writes. */ - if (type == ORANGEFS_IO_WRITE) { - ret = precopy_buffers(buffer_index, - iter, - total_size); - if (ret < 0) + if (type == ORANGEFS_IO_WRITE && total_size) { + ret = orangefs_bufmap_copy_from_iovec(iter, buffer_index, + total_size); + if (ret < 0) { + gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n", + __func__, (long)ret); goto out; + } } gossip_debug(GOSSIP_FILE_DEBUG, @@ -260,14 +197,20 @@ populate_shared_memory: /* * Stage 3: Post copy buffers from client-core's address space - * postcopy_buffers only pertains to reads. 
*/ - if (type == ORANGEFS_IO_READ) { - ret = postcopy_buffers(buffer_index, - iter, - new_op->downcall.resp.io.amt_complete); - if (ret < 0) + if (type == ORANGEFS_IO_READ && new_op->downcall.resp.io.amt_complete) { + /* + * NOTE: the iovector can either contain addresses which + * can futher be kernel-space or user-space addresses. + * or it can pointers to struct page's + */ + ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index, + new_op->downcall.resp.io.amt_complete); + if (ret < 0) { + gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n", + __func__, (long)ret); goto out; + } } gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): Amount %s, returned by the sys-io call:%d\n", @@ -585,6 +528,28 @@ static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long ar return ret; } +static int orangefs_fault(struct vm_fault *vmf) +{ + struct file *file = vmf->vma->vm_file; + int rc; + rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1, + STATX_SIZE); + if (rc == -ESTALE) + rc = -EIO; + if (rc) { + gossip_err("%s: orangefs_inode_getattr failed, " + "rc:%d:.\n", __func__, rc); + return rc; + } + return filemap_fault(vmf); +} + +const struct vm_operations_struct orangefs_file_vm_ops = { + .fault = orangefs_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = filemap_page_mkwrite, +}; + /* * Memory map a region of a file. */ @@ -596,12 +561,16 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) (char *)file->f_path.dentry->d_name.name : (char *)"Unknown")); + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + /* set the sequential readahead hint */ vma->vm_flags |= VM_SEQ_READ; vma->vm_flags &= ~VM_RAND_READ; - /* Use readonly mmap since we cannot support writable maps. 
*/ - return generic_file_readonly_mmap(file, vma); + file_accessed(file); + vma->vm_ops = &orangefs_file_vm_ops; + return 0; } #define mapping_nrpages(idata) ((idata)->nrpages) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index fe1d705ad91f..79c61da8b1bc 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -138,7 +138,7 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, } /** ORANGEFS2 implementation of address space operations */ -const struct address_space_operations orangefs_address_operations = { +static const struct address_space_operations orangefs_address_operations = { .readpage = orangefs_readpage, .readpages = orangefs_readpages, .invalidatepage = orangefs_invalidatepage, @@ -307,7 +307,7 @@ int orangefs_update_time(struct inode *inode, struct timespec *time, int flags) } /* ORANGEDS2 implementation of VFS inode operations for files */ -const struct inode_operations orangefs_file_inode_operations = { +static const struct inode_operations orangefs_file_inode_operations = { .get_acl = orangefs_get_acl, .set_acl = orangefs_set_acl, .setattr = orangefs_setattr, diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 59f444dced9b..4f927023d095 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -71,9 +71,9 @@ static void put(struct slot_map *m, int slot) spin_lock(&m->q.lock); __clear_bit(slot, m->map); v = ++m->c; - if (unlikely(v == 1)) /* no free slots -> one free slot */ + if (v > 0) wake_up_locked(&m->q); - else if (unlikely(v == -1)) /* finished dying */ + if (unlikely(v == -1)) /* finished dying */ wake_up_all_locked(&m->q); spin_unlock(&m->q.lock); } diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h index c7db56a31b92..6e079d4230d0 100644 --- a/fs/orangefs/orangefs-debug.h +++ b/fs/orangefs/orangefs-debug.h @@ -43,12 +43,6 @@ #define GOSSIP_MAX_NR 16 #define GOSSIP_MAX_DEBUG (((__u64)1 << GOSSIP_MAX_NR) - 1) -/*function prototypes*/ -__u64 
ORANGEFS_kmod_eventlog_to_mask(const char *event_logging); -__u64 ORANGEFS_debug_eventlog_to_mask(const char *event_logging); -char *ORANGEFS_debug_mask_to_eventlog(__u64 mask); -char *ORANGEFS_kmod_mask_to_eventlog(__u64 mask); - /* a private internal type */ struct __keyword_mask_s { const char *keyword; diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index eebbaece85ef..c29bb0ebc6bb 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -65,11 +65,7 @@ #define ORANGEFS_REQDEVICE_NAME "pvfs2-req" #define ORANGEFS_DEVREQ_MAGIC 0x20030529 -#define ORANGEFS_LINK_MAX 0x000000FF #define ORANGEFS_PURGE_RETRY_COUNT 0x00000005 -#define ORANGEFS_MAX_NUM_OPTIONS 0x00000004 -#define ORANGEFS_MAX_MOUNT_OPT_LEN 0x00000080 -#define ORANGEFS_MAX_FSKEY_LEN 64 #define MAX_DEV_REQ_UPSIZE (2 * sizeof(__s32) + \ sizeof(__u64) + sizeof(struct orangefs_upcall_s)) @@ -113,15 +109,6 @@ extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type); extern int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type); /* - * Redefine xtvec structure so that we could move helper functions out of - * the define - */ -struct xtvec { - __kernel_off_t xtv_off; /* must be off_t */ - __kernel_size_t xtv_len; /* must be size_t */ -}; - -/* * orangefs data structures */ struct orangefs_kernel_op_s { @@ -224,39 +211,6 @@ struct orangefs_sb_info_s { struct list_head list; }; -/* - * structure that holds the state of any async I/O operation issued - * through the VFS. Needed especially to handle cancellation requests - * or even completion notification so that the VFS client-side daemon - * can free up its vfs_request slots. 
- */ -struct orangefs_kiocb_s { - /* the pointer to the task that initiated the AIO */ - struct task_struct *tsk; - - /* pointer to the kiocb that kicked this operation */ - struct kiocb *kiocb; - - /* buffer index that was used for the I/O */ - struct orangefs_bufmap *bufmap; - int buffer_index; - - /* orangefs kernel operation type */ - struct orangefs_kernel_op_s *op; - - /* set to indicate the type of the operation */ - int rw; - - /* file offset */ - loff_t offset; - - /* and the count in bytes */ - size_t bytes_to_be_copied; - - ssize_t bytes_copied; - int needs_cleanup; -}; - struct orangefs_stats { unsigned long cache_hits; unsigned long cache_misses; @@ -305,21 +259,6 @@ static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode) return &(ORANGEFS_I(inode)->refn.khandle); } -static inline ino_t get_ino_from_khandle(struct inode *inode) -{ - struct orangefs_khandle *khandle; - ino_t ino; - - khandle = get_khandle_from_ino(inode); - ino = orangefs_khandle_to_ino(khandle); - return ino; -} - -static inline ino_t get_parent_ino_from_dentry(struct dentry *dentry) -{ - return get_ino_from_khandle(dentry->d_parent->d_inode); -} - static inline int is_root_handle(struct inode *inode) { gossip_debug(GOSSIP_DCACHE_DEBUG, @@ -391,7 +330,6 @@ void fsid_key_table_finalize(void); /* * defined in inode.c */ -__u32 convert_to_orangefs_mask(unsigned long lite_mask); struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, int mode, @@ -410,17 +348,6 @@ int orangefs_update_time(struct inode *, struct timespec *, int); /* * defined in xattr.c */ -int orangefs_setxattr(struct dentry *dentry, - const char *name, - const void *value, - size_t size, - int flags); - -ssize_t orangefs_getxattr(struct dentry *dentry, - const char *name, - void *buffer, - size_t size); - ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* @@ -467,8 +394,6 @@ int orangefs_inode_check_changed(struct inode *inode); int 
orangefs_inode_setattr(struct inode *inode, struct iattr *iattr); -int orangefs_unmount_sb(struct super_block *sb); - bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op); int orangefs_normalize_to_errno(__s32 error_code); @@ -487,16 +412,11 @@ extern struct list_head *orangefs_htable_ops_in_progress; extern spinlock_t orangefs_htable_ops_in_progress_lock; extern int hash_table_size; -extern const struct address_space_operations orangefs_address_operations; -extern const struct inode_operations orangefs_file_inode_operations; extern const struct file_operations orangefs_file_operations; extern const struct inode_operations orangefs_symlink_inode_operations; extern const struct inode_operations orangefs_dir_inode_operations; extern const struct file_operations orangefs_dir_operations; extern const struct dentry_operations orangefs_dentry_operations; -extern const struct file_operations orangefs_devreq_file_operations; - -extern wait_queue_head_t orangefs_bufmap_init_waitq; /* * misc convenience macros diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index ea6256d136d1..00fadaf0da8f 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -500,7 +500,7 @@ int orangefs_normalize_to_errno(__s32 error_code) * server. 
*/ } else if (error_code > 0) { - gossip_err("orangefs: error status receieved.\n"); + gossip_err("orangefs: error status received.\n"); gossip_err("orangefs: assuming error code is inverted.\n"); error_code = -error_code; } diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h index dc6e3e6269c3..61ee8d64c842 100644 --- a/fs/orangefs/protocol.h +++ b/fs/orangefs/protocol.h @@ -5,11 +5,6 @@ #include <linux/slab.h> #include <linux/ioctl.h> -/* pvfs2-config.h ***********************************************************/ -#define ORANGEFS_VERSION_MAJOR 2 -#define ORANGEFS_VERSION_MINOR 9 -#define ORANGEFS_VERSION_SUB 0 - /* khandle stuff ***********************************************************/ /* @@ -70,16 +65,6 @@ static inline void ORANGEFS_khandle_from(struct orangefs_khandle *kh, } /* pvfs2-types.h ************************************************************/ -typedef __u32 ORANGEFS_uid; -typedef __u32 ORANGEFS_gid; -typedef __s32 ORANGEFS_fs_id; -typedef __u32 ORANGEFS_permissions; -typedef __u64 ORANGEFS_time; -typedef __s64 ORANGEFS_size; -typedef __u64 ORANGEFS_flags; -typedef __u64 ORANGEFS_ds_position; -typedef __s32 ORANGEFS_error; -typedef __s64 ORANGEFS_offset; #define ORANGEFS_SUPER_MAGIC 0x20030528 @@ -145,7 +130,6 @@ typedef __s64 ORANGEFS_offset; #define ORANGEFS_APPEND_FL FS_APPEND_FL #define ORANGEFS_NOATIME_FL FS_NOATIME_FL #define ORANGEFS_MIRROR_FL 0x01000000ULL -#define ORANGEFS_O_EXECUTE (1 << 0) #define ORANGEFS_FS_ID_NULL ((__s32)0) #define ORANGEFS_ATTR_SYS_UID (1 << 0) #define ORANGEFS_ATTR_SYS_GID (1 << 1) @@ -229,35 +213,6 @@ enum orangefs_ds_type { ORANGEFS_TYPE_INTERNAL = (1 << 5) /* for the server's private use */ }; -/* - * ORANGEFS_certificate simply stores a buffer with the buffer size. - * The buffer can be converted to an OpenSSL X509 struct for use. - */ -struct ORANGEFS_certificate { - __u32 buf_size; - unsigned char *buf; -}; - -/* - * A credential identifies a user and is signed by the client/user - * private key. 
- */ -struct ORANGEFS_credential { - __u32 userid; /* user id */ - __u32 num_groups; /* length of group_array */ - __u32 *group_array; /* groups for which the user is a member */ - char *issuer; /* alias of the issuing server */ - __u64 timeout; /* seconds after epoch to time out */ - __u32 sig_size; /* length of the signature in bytes */ - unsigned char *signature; /* digital signature */ - struct ORANGEFS_certificate certificate; /* user certificate buffer */ -}; -#define extra_size_ORANGEFS_credential (ORANGEFS_REQ_LIMIT_GROUPS * \ - sizeof(__u32) + \ - ORANGEFS_REQ_LIMIT_ISSUER + \ - ORANGEFS_REQ_LIMIT_SIGNATURE + \ - extra_size_ORANGEFS_certificate) - /* This structure is used by the VFS-client interaction alone */ struct ORANGEFS_keyval_pair { char key[ORANGEFS_MAX_XATTR_NAMELEN]; diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index ce6ff5a0a6e4..17032631c5cf 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -86,3 +86,20 @@ config OVERLAY_FS_NFS_EXPORT case basis with the "nfs_export=on" mount option. Say N unless you fully understand the consequences. + +config OVERLAY_FS_XINO_AUTO + bool "Overlayfs: auto enable inode number mapping" + default n + depends on OVERLAY_FS + help + If this config option is enabled then overlay filesystems will use + unused high bits in undelying filesystem inode numbers to map all + inodes to a unified address space. The mapped 64bit inode numbers + might not be compatible with applications that expect 32bit inodes. + + If compatibility with applications that expect 32bit inodes is not an + issue, then it is safe and recommended to say Y here. + + For more information, see Documentation/filesystems/overlayfs.txt + + If unsure, say N. 
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index d855f508fa20..8bede0742619 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -232,7 +232,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) return err; } -struct ovl_fh *ovl_encode_fh(struct dentry *real, bool is_upper) +struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) { struct ovl_fh *fh; int fh_type, fh_len, dwords; @@ -300,7 +300,7 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower, * up and a pure upper inode. */ if (ovl_can_decode_fh(lower->d_sb)) { - fh = ovl_encode_fh(lower, false); + fh = ovl_encode_real_fh(lower, false); if (IS_ERR(fh)) return PTR_ERR(fh); } @@ -321,7 +321,7 @@ static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index) const struct ovl_fh *fh; int err; - fh = ovl_encode_fh(upper, true); + fh = ovl_encode_real_fh(upper, true); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 87bd4148f4fb..425a94672300 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -228,8 +228,8 @@ static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) goto fail; /* Encode an upper or lower file handle */ - fh = ovl_encode_fh(enc_lower ? ovl_dentry_lower(dentry) : - ovl_dentry_upper(dentry), !enc_lower); + fh = ovl_encode_real_fh(enc_lower ? 
ovl_dentry_lower(dentry) : + ovl_dentry_upper(dentry), !enc_lower); err = PTR_ERR(fh); if (IS_ERR(fh)) goto fail; @@ -267,8 +267,8 @@ static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len) return OVL_FILEID; } -static int ovl_encode_inode_fh(struct inode *inode, u32 *fid, int *max_len, - struct inode *parent) +static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, + struct inode *parent) { struct dentry *dentry; int type; @@ -305,15 +305,12 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, if (d_is_dir(upper ?: lower)) return ERR_PTR(-EIO); - inode = ovl_get_inode(sb, dget(upper), lower, index, !!lower); + inode = ovl_get_inode(sb, dget(upper), lowerpath, index, !!lower); if (IS_ERR(inode)) { dput(upper); return ERR_CAST(inode); } - if (index) - ovl_set_flag(OVL_INDEX, inode); - dentry = d_find_any_alias(inode); if (!dentry) { dentry = d_alloc_anon(inode->i_sb); @@ -685,7 +682,7 @@ static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, if (!ofs->upper_mnt) return ERR_PTR(-EACCES); - upper = ovl_decode_fh(fh, ofs->upper_mnt); + upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true); if (IS_ERR_OR_NULL(upper)) return upper; @@ -703,25 +700,39 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, struct ovl_path *stack = &origin; struct dentry *dentry = NULL; struct dentry *index = NULL; - struct inode *inode = NULL; - bool is_deleted = false; + struct inode *inode; int err; - /* First lookup indexed upper by fh */ + /* First lookup overlay inode in inode cache by origin fh */ + err = ovl_check_origin_fh(ofs, fh, false, NULL, &stack); + if (err) + return ERR_PTR(err); + + if (!d_is_dir(origin.dentry) || + !(origin.dentry->d_flags & DCACHE_DISCONNECTED)) { + inode = ovl_lookup_inode(sb, origin.dentry, false); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_err; + if (inode) { + dentry = d_find_any_alias(inode); + iput(inode); + if (dentry) + goto out; + } + } + + /* Then lookup indexed 
upper/whiteout by origin fh */ if (ofs->indexdir) { index = ovl_get_index_fh(ofs, fh); err = PTR_ERR(index); if (IS_ERR(index)) { - if (err != -ESTALE) - return ERR_PTR(err); - - /* Found a whiteout index - treat as deleted inode */ - is_deleted = true; index = NULL; + goto out_err; } } - /* Then try to get upper dir by index */ + /* Then try to get a connected upper dir by index */ if (index && d_is_dir(index)) { struct dentry *upper = ovl_index_upper(ofs, index); @@ -734,24 +745,19 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, goto out; } - /* Then lookup origin by fh */ - err = ovl_check_origin_fh(ofs, fh, NULL, &stack); - if (err) { - goto out_err; - } else if (index) { - err = ovl_verify_origin(index, origin.dentry, false); + /* Otherwise, get a connected non-upper dir or disconnected non-dir */ + if (d_is_dir(origin.dentry) && + (origin.dentry->d_flags & DCACHE_DISCONNECTED)) { + dput(origin.dentry); + origin.dentry = NULL; + err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack); if (err) goto out_err; - } else if (is_deleted) { - /* Lookup deleted non-dir by origin inode */ - if (!d_is_dir(origin.dentry)) - inode = ovl_lookup_inode(sb, origin.dentry, false); - err = -ESTALE; - if (!inode || atomic_read(&inode->i_count) == 1) + } + if (index) { + err = ovl_verify_origin(index, origin.dentry, false); + if (err) goto out_err; - - /* Deleted but still open? 
*/ - index = dget(ovl_i_dentry_upper(inode)); } dentry = ovl_get_dentry(sb, NULL, &origin, index); @@ -759,7 +765,6 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, out: dput(origin.dentry); dput(index); - iput(inode); return dentry; out_err: @@ -829,7 +834,7 @@ static struct dentry *ovl_get_parent(struct dentry *dentry) } const struct export_operations ovl_export_operations = { - .encode_fh = ovl_encode_inode_fh, + .encode_fh = ovl_encode_fh, .fh_to_dentry = ovl_fh_to_dentry, .fh_to_parent = ovl_fh_to_parent, .get_name = ovl_get_name, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 3b1bd469accd..6e3815fb006b 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -16,13 +16,6 @@ #include "overlayfs.h" -static dev_t ovl_get_pseudo_dev(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - return oe->lowerstack[0].layer->pseudo_dev; -} - int ovl_setattr(struct dentry *dentry, struct iattr *attr) { int err; @@ -66,6 +59,69 @@ out: return err; } +static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, + struct ovl_layer *lower_layer) +{ + bool samefs = ovl_same_sb(dentry->d_sb); + unsigned int xinobits = ovl_xino_bits(dentry->d_sb); + + if (samefs) { + /* + * When all layers are on the same fs, all real inode + * number are unique, so we use the overlay st_dev, + * which is friendly to du -x. + */ + stat->dev = dentry->d_sb->s_dev; + return 0; + } else if (xinobits) { + unsigned int shift = 64 - xinobits; + /* + * All inode numbers of underlying fs should not be using the + * high xinobits, so we use high xinobits to partition the + * overlay st_ino address space. The high bits holds the fsid + * (upper fsid is 0). This way overlay inode numbers are unique + * and all inodes use overlay st_dev. Inode numbers are also + * persistent for a given layer configuration. 
+ */ + if (stat->ino >> shift) { + pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n", + dentry, stat->ino, xinobits); + } else { + if (lower_layer) + stat->ino |= ((u64)lower_layer->fsid) << shift; + + stat->dev = dentry->d_sb->s_dev; + return 0; + } + } + + /* The inode could not be mapped to a unified st_ino address space */ + if (S_ISDIR(dentry->d_inode->i_mode)) { + /* + * Always use the overlay st_dev for directories, so 'find + * -xdev' will scan the entire overlay mount and won't cross the + * overlay mount boundaries. + * + * If not all layers are on the same fs the pair {real st_ino; + * overlay st_dev} is not unique, so use the non persistent + * overlay st_ino for directories. + */ + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + } else if (lower_layer && lower_layer->fsid) { + /* + * For non-samefs setup, if we cannot map all layers st_ino + * to a unified address space, we need to make sure that st_dev + * is unique per lower fs. Upper layer uses real st_dev and + * lower layers use the unique anonymous bdev assigned to the + * lower fs. + */ + stat->dev = lower_layer->fs->pseudo_dev; + } + + return 0; +} + int ovl_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -75,6 +131,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat, const struct cred *old_cred; bool is_dir = S_ISDIR(dentry->d_inode->i_mode); bool samefs = ovl_same_sb(dentry->d_sb); + struct ovl_layer *lower_layer = NULL; int err; type = ovl_path_real(dentry, &realpath); @@ -84,14 +141,18 @@ int ovl_getattr(const struct path *path, struct kstat *stat, goto out; /* - * For non-dir or same fs, we use st_ino of the copy up origin, if we - * know it. This guaranties constant st_dev/st_ino across copy up. + * For non-dir or same fs, we use st_ino of the copy up origin. + * This guaranties constant st_dev/st_ino across copy up. 
+ * With xino feature and non-samefs, we use st_ino of the copy up + * origin masked with high bits that represent the layer id. * - * If filesystem supports NFS export ops, this also guaranties + * If lower filesystem supports NFS file handles, this also guaranties * persistent st_ino across mount cycle. */ - if (!is_dir || samefs) { - if (OVL_TYPE_ORIGIN(type)) { + if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) { + if (!OVL_TYPE_UPPER(type)) { + lower_layer = ovl_layer_lower(dentry); + } else if (OVL_TYPE_ORIGIN(type)) { struct kstat lowerstat; u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0); @@ -118,43 +179,17 @@ int ovl_getattr(const struct path *path, struct kstat *stat, */ if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || (!ovl_verify_lower(dentry->d_sb) && - (is_dir || lowerstat.nlink == 1))) + (is_dir || lowerstat.nlink == 1))) { stat->ino = lowerstat.ino; - - if (samefs) - WARN_ON_ONCE(stat->dev != lowerstat.dev); - else - stat->dev = ovl_get_pseudo_dev(dentry); - } - if (samefs) { - /* - * When all layers are on the same fs, all real inode - * number are unique, so we use the overlay st_dev, - * which is friendly to du -x. - */ - stat->dev = dentry->d_sb->s_dev; - } else if (!OVL_TYPE_UPPER(type)) { - /* - * For non-samefs setup, to make sure that st_dev/st_ino - * pair is unique across the system, we use a unique - * anonymous st_dev for lower layer inode. - */ - stat->dev = ovl_get_pseudo_dev(dentry); + lower_layer = ovl_layer_lower(dentry); + } } - } else { - /* - * Always use the overlay st_dev for directories, so 'find - * -xdev' will scan the entire overlay mount and won't cross the - * overlay mount boundaries. - * - * If not all layers are on the same fs the pair {real st_ino; - * overlay st_dev} is not unique, so use the non persistent - * overlay st_ino for directories. 
- */ - stat->dev = dentry->d_sb->s_dev; - stat->ino = dentry->d_inode->i_ino; } + err = ovl_map_dev_ino(dentry, stat, lower_layer); + if (err) + goto out; + /* * It's probably not worth it to count subdirs to get the * correct link count. nlink=1 seems to pacify 'find' and @@ -383,24 +418,18 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags) int ovl_update_time(struct inode *inode, struct timespec *ts, int flags) { - struct dentry *alias; - struct path upperpath; - - if (!(flags & S_ATIME)) - return 0; - - alias = d_find_any_alias(inode); - if (!alias) - return 0; - - ovl_path_upper(alias, &upperpath); - if (upperpath.dentry) { - touch_atime(&upperpath); - inode->i_atime = d_inode(upperpath.dentry)->i_atime; + if (flags & S_ATIME) { + struct ovl_fs *ofs = inode->i_sb->s_fs_info; + struct path upperpath = { + .mnt = ofs->upper_mnt, + .dentry = ovl_upperdentry_dereference(OVL_I(inode)), + }; + + if (upperpath.dentry) { + touch_atime(&upperpath); + inode->i_atime = d_inode(upperpath.dentry)->i_atime; + } } - - dput(alias); - return 0; } @@ -459,9 +488,27 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) #endif } -static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) +static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, + unsigned long ino, int fsid) { - inode->i_ino = get_next_ino(); + int xinobits = ovl_xino_bits(inode->i_sb); + + /* + * When NFS export is enabled and d_ino is consistent with st_ino + * (samefs or i_ino has enough bits to encode layer), set the same + * value used for d_ino to i_ino, because nfsd readdirplus compares + * d_ino values to i_ino values of child entries. When called from + * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real + * upper inode i_ino on ovl_inode_init() or ovl_inode_update(). 
+ */ + if (inode->i_sb->s_export_op && + (ovl_same_sb(inode->i_sb) || xinobits)) { + inode->i_ino = ino; + if (xinobits && fsid && !(ino >> (64 - xinobits))) + inode->i_ino |= (unsigned long)fsid << (64 - xinobits); + } else { + inode->i_ino = get_next_ino(); + } inode->i_mode = mode; inode->i_flags |= S_NOCMTIME; #ifdef CONFIG_FS_POSIX_ACL @@ -597,7 +644,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) inode = new_inode(sb); if (inode) - ovl_fill_inode(inode, mode, rdev); + ovl_fill_inode(inode, mode, rdev, 0, 0); return inode; } @@ -703,13 +750,16 @@ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, } struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, - struct dentry *lowerdentry, struct dentry *index, + struct ovl_path *lowerpath, struct dentry *index, unsigned int numlower) { struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; + struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index); + int fsid = bylower ? 
lowerpath->layer->fsid : 0; bool is_dir; + unsigned long ino = 0; if (!realinode) realinode = d_inode(lowerdentry); @@ -748,18 +798,22 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, if (!is_dir) nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink); set_nlink(inode, nlink); + ino = key->i_ino; } else { /* Lower hardlink that will be broken on copy up */ inode = new_inode(sb); if (!inode) goto out_nomem; } - ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); + ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid); ovl_inode_init(inode, upperdentry, lowerdentry); if (upperdentry && ovl_is_impuredir(upperdentry)) ovl_set_flag(OVL_IMPURE, inode); + if (index) + ovl_set_flag(OVL_INDEX, inode); + /* Check for non-merge dir that may have whiteouts */ if (is_dir) { if (((upperdentry && lowerdentry) || numlower > 1) || diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 70fcfcc684cc..2dba29eadde6 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -56,6 +56,15 @@ static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, if (s == next) goto invalid; } + /* + * One of the ancestor path elements in an absolute path + * lookup in ovl_lookup_layer() could have been opaque and + * that will stop further lookup in lower layers (d->stop=true) + * But we have found an absolute redirect in decendant path + * element and that should force continue lookup in lower + * layers (reset d->stop). 
+ */ + d->stop = false; } else { if (strchr(buf, '/') != NULL) goto invalid; @@ -171,7 +180,8 @@ invalid: goto out; } -struct dentry *ovl_decode_fh(struct ovl_fh *fh, struct vfsmount *mnt) +struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, + bool connected) { struct dentry *real; int bytes; @@ -186,7 +196,7 @@ struct dentry *ovl_decode_fh(struct ovl_fh *fh, struct vfsmount *mnt) bytes = (fh->len - offsetof(struct ovl_fh, fid)); real = exportfs_decode_fh(mnt, (struct fid *)fh->fid, bytes >> 2, (int)fh->type, - ovl_acceptable, mnt); + connected ? ovl_acceptable : NULL, mnt); if (IS_ERR(real)) { /* * Treat stale file handle to lower file as "origin unknown". @@ -220,6 +230,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, { struct dentry *this; int err; + bool last_element = !post[0]; this = lookup_one_len_unlocked(name, base, namelen); if (IS_ERR(this)) { @@ -245,11 +256,23 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, d->stop = true; if (d->is_dir) goto put_and_out; + + /* + * NB: handle failure to lookup non-last element when non-dir + * redirects become possible + */ + WARN_ON(!last_element); goto out; } - d->is_dir = true; - if (!d->last && ovl_is_opaquedir(this)) { - d->stop = d->opaque = true; + if (last_element) + d->is_dir = true; + if (d->last) + goto out; + + if (ovl_is_opaquedir(this)) { + d->stop = true; + if (last_element) + d->opaque = true; goto out; } err = ovl_check_redirect(this, d, prelen, post); @@ -310,14 +333,15 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, } -int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, +int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp) { struct dentry *origin = NULL; int i; for (i = 0; i < ofs->numlower; i++) { - origin = ovl_decode_fh(fh, ofs->lower_layers[i].mnt); + origin = ovl_decode_real_fh(fh, 
ofs->lower_layers[i].mnt, + connected); if (origin) break; } @@ -361,7 +385,7 @@ static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, if (IS_ERR_OR_NULL(fh)) return PTR_ERR(fh); - err = ovl_check_origin_fh(ofs, fh, upperdentry, stackp); + err = ovl_check_origin_fh(ofs, fh, false, upperdentry, stackp); kfree(fh); if (err) { @@ -415,7 +439,7 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name, struct ovl_fh *fh; int err; - fh = ovl_encode_fh(real, is_upper); + fh = ovl_encode_real_fh(real, is_upper); err = PTR_ERR(fh); if (IS_ERR(fh)) goto fail; @@ -451,7 +475,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) if (IS_ERR_OR_NULL(fh)) return ERR_CAST(fh); - upper = ovl_decode_fh(fh, ofs->upper_mnt); + upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true); kfree(fh); if (IS_ERR_OR_NULL(upper)) @@ -558,7 +582,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) /* Check if non-dir index is orphan and don't warn before cleaning it */ if (!d_is_dir(index) && d_inode(index)->i_nlink == 1) { - err = ovl_check_origin_fh(ofs, fh, index, &stack); + err = ovl_check_origin_fh(ofs, fh, false, index, &stack); if (err) goto fail; @@ -619,7 +643,7 @@ int ovl_get_index_name(struct dentry *origin, struct qstr *name) struct ovl_fh *fh; int err; - fh = ovl_encode_fh(origin, false); + fh = ovl_encode_real_fh(origin, false); if (IS_ERR(fh)) return PTR_ERR(fh); @@ -815,7 +839,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .is_dir = false, .opaque = false, .stop = false, - .last = !poe->numlower, + .last = ofs->config.redirect_follow ? 
false : !poe->numlower, .redirect = NULL, }; @@ -873,7 +897,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, for (i = 0; !d.stop && i < poe->numlower; i++) { struct ovl_path lower = poe->lowerstack[i]; - d.last = i == poe->numlower - 1; + if (!ofs->config.redirect_follow) + d.last = i == poe->numlower - 1; + else + d.last = lower.layer->idx == roe->numlower; + err = ovl_lookup_layer(lower.dentry, &d, &this); if (err) goto out_put; @@ -976,17 +1004,18 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, upperdentry = dget(index); if (upperdentry || ctr) { - if (ctr) - origin = stack[0].dentry; - inode = ovl_get_inode(dentry->d_sb, upperdentry, origin, index, + inode = ovl_get_inode(dentry->d_sb, upperdentry, stack, index, ctr); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free_oe; + /* + * NB: handle redirected hard links when non-dir redirects + * become possible + */ + WARN_ON(OVL_I(inode)->redirect); OVL_I(inode)->redirect = upperredirect; - if (index) - ovl_set_flag(OVL_INDEX, inode); } revert_creds(old_cred); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 225ff1171147..e0b7de799f6b 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -202,7 +202,7 @@ void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); struct super_block *ovl_same_sb(struct super_block *sb); -bool ovl_can_decode_fh(struct super_block *sb); +int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); bool ovl_verify_lower(struct super_block *sb); @@ -215,6 +215,7 @@ void ovl_path_lower(struct dentry *dentry, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); +struct 
ovl_layer *ovl_layer_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); @@ -263,11 +264,19 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); } +static inline unsigned int ovl_xino_bits(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->xino_bits; +} + /* namei.c */ int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); -struct dentry *ovl_decode_fh(struct ovl_fh *fh, struct vfsmount *mnt); -int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, +struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, + bool connected); +int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp); int ovl_verify_set_fh(struct dentry *dentry, const char *name, struct dentry *real, bool is_upper, bool set); @@ -329,7 +338,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, bool is_upper); struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, - struct dentry *lowerdentry, struct dentry *index, + struct ovl_path *lowerpath, struct dentry *index, unsigned int numlower); static inline void ovl_copyattr(struct inode *from, struct inode *to) { @@ -361,7 +370,7 @@ int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_flags(struct dentry *dentry, int flags); int ovl_copy_xattr(struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); -struct ovl_fh *ovl_encode_fh(struct dentry *real, bool is_upper); +struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper); int ovl_set_origin(struct dentry *dentry, struct dentry *lower, struct dentry *upper); diff --git 
a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index bfef6edcc111..41655a7d6894 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -18,13 +18,21 @@ struct ovl_config { const char *redirect_mode; bool index; bool nfs_export; + int xino; +}; + +struct ovl_sb { + struct super_block *sb; + dev_t pseudo_dev; }; struct ovl_layer { struct vfsmount *mnt; - dev_t pseudo_dev; - /* Index of this layer in fs root (upper == 0) */ + struct ovl_sb *fs; + /* Index of this layer in fs root (upper idx == 0) */ int idx; + /* One fsid per unique underlying sb (upper fsid == 0) */ + int fsid; }; struct ovl_path { @@ -35,8 +43,11 @@ struct ovl_path { /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; - unsigned numlower; + unsigned int numlower; + /* Number of unique lower sb that differ from upper sb */ + unsigned int numlowerfs; struct ovl_layer *lower_layers; + struct ovl_sb *lower_fs; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; /* workdir is the 'work' directory under workbasedir */ @@ -50,11 +61,11 @@ struct ovl_fs { const struct cred *creator_cred; bool tmpfile; bool noxattr; - /* sb common to all layers */ - struct super_block *same_sb; /* Did we take the inuse lock? 
*/ bool upperdir_locked; bool workdir_locked; + /* Inode numbers in all layers do not use the high xino_bits */ + unsigned int xino_bits; }; /* private information held for every overlayfs dentry */ diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index c11f5c0906c3..ef1fe42ff7bb 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -120,6 +120,10 @@ static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, if (!rdd->dentry) return false; + /* Always recalc d_ino when remapping lower inode numbers */ + if (ovl_xino_bits(rdd->dentry->d_sb)) + return true; + /* Always recalc d_ino for parent */ if (strcmp(p->name, "..") == 0) return true; @@ -435,6 +439,19 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) return cache; } +/* Map inode number to lower fs unique range */ +static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, + const char *name, int namelen) +{ + if (ino >> (64 - xinobits)) { + pr_warn_ratelimited("overlayfs: d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", + namelen, name, ino, xinobits); + return ino; + } + + return ino | ((u64)fsid) << (64 - xinobits); +} + /* * Set d_ino for upper entries. Non-upper entries should always report * the uppermost real inode ino and should not call this function. 
@@ -452,9 +469,10 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) struct dentry *this = NULL; enum ovl_path_type type; u64 ino = p->real_ino; + int xinobits = ovl_xino_bits(dir->d_sb); int err = 0; - if (!ovl_same_sb(dir->d_sb)) + if (!ovl_same_sb(dir->d_sb) && !xinobits) goto out; if (p->name[0] == '.') { @@ -491,6 +509,10 @@ get: WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); ino = stat.ino; + } else if (xinobits && !OVL_TYPE_UPPER(type)) { + ino = ovl_remap_lower_ino(ino, xinobits, + ovl_layer_lower(this)->fsid, + p->name, p->len); } out: @@ -618,6 +640,8 @@ struct ovl_readdir_translate { struct ovl_dir_cache *cache; struct dir_context ctx; u64 parent_ino; + int fsid; + int xinobits; }; static int ovl_fill_real(struct dir_context *ctx, const char *name, @@ -628,14 +652,17 @@ static int ovl_fill_real(struct dir_context *ctx, const char *name, container_of(ctx, struct ovl_readdir_translate, ctx); struct dir_context *orig_ctx = rdt->orig_ctx; - if (rdt->parent_ino && strcmp(name, "..") == 0) + if (rdt->parent_ino && strcmp(name, "..") == 0) { ino = rdt->parent_ino; - else if (rdt->cache) { + } else if (rdt->cache) { struct ovl_cache_entry *p; p = ovl_cache_entry_find(&rdt->cache->root, name, namelen); if (p) ino = p->ino; + } else if (rdt->xinobits) { + ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid, + name, namelen); } return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); @@ -646,11 +673,16 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) int err; struct ovl_dir_file *od = file->private_data; struct dentry *dir = file->f_path.dentry; + struct ovl_layer *lower_layer = ovl_layer_lower(dir); struct ovl_readdir_translate rdt = { .ctx.actor = ovl_fill_real, .orig_ctx = ctx, + .xinobits = ovl_xino_bits(dir->d_sb), }; + if (rdt.xinobits && lower_layer) + rdt.fsid = lower_layer->fsid; + if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) { struct kstat stat; struct path statpath = file->f_path; 
@@ -693,9 +725,10 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) * dir is impure then need to adjust d_ino for copied up * entries. */ - if (ovl_same_sb(dentry->d_sb) && - (ovl_test_flag(OVL_IMPURE, d_inode(dentry)) || - OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent)))) { + if (ovl_xino_bits(dentry->d_sb) || + (ovl_same_sb(dentry->d_sb) && + (ovl_test_flag(OVL_IMPURE, d_inode(dentry)) || + OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { return ovl_iterate_real(file, ctx); } return iterate_dir(od->realfile, ctx); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7c24619ae7fc..e8551c97de51 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -17,6 +17,7 @@ #include <linux/statfs.h> #include <linux/seq_file.h> #include <linux/posix_acl_xattr.h> +#include <linux/exportfs.h> #include "overlayfs.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); @@ -50,6 +51,11 @@ module_param_named(nfs_export, ovl_nfs_export_def, bool, 0644); MODULE_PARM_DESC(ovl_nfs_export_def, "Default to on or off for the NFS export feature"); +static bool ovl_xino_auto_def = IS_ENABLED(CONFIG_OVERLAY_FS_XINO_AUTO); +module_param_named(xino_auto, ovl_xino_auto_def, bool, 0644); +MODULE_PARM_DESC(ovl_xino_auto_def, + "Auto enable xino feature"); + static void ovl_entry_stack_free(struct ovl_entry *oe) { unsigned int i; @@ -236,11 +242,12 @@ static void ovl_free_fs(struct ovl_fs *ofs) if (ofs->upperdir_locked) ovl_inuse_unlock(ofs->upper_mnt->mnt_root); mntput(ofs->upper_mnt); - for (i = 0; i < ofs->numlower; i++) { + for (i = 0; i < ofs->numlower; i++) mntput(ofs->lower_layers[i].mnt); - free_anon_bdev(ofs->lower_layers[i].pseudo_dev); - } + for (i = 0; i < ofs->numlowerfs; i++) + free_anon_bdev(ofs->lower_fs[i].pseudo_dev); kfree(ofs->lower_layers); + kfree(ofs->lower_fs); kfree(ofs->config.lowerdir); kfree(ofs->config.upperdir); @@ -325,6 +332,23 @@ static const char *ovl_redirect_mode_def(void) return ovl_redirect_dir_def ? 
"on" : "off"; } +enum { + OVL_XINO_OFF, + OVL_XINO_AUTO, + OVL_XINO_ON, +}; + +static const char * const ovl_xino_str[] = { + "off", + "auto", + "on", +}; + +static inline int ovl_xino_def(void) +{ + return ovl_xino_auto_def ? OVL_XINO_AUTO : OVL_XINO_OFF; +} + /** * ovl_show_options * @@ -350,6 +374,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) if (ofs->config.nfs_export != ovl_nfs_export_def) seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? "on" : "off"); + if (ofs->config.xino != ovl_xino_def()) + seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]); return 0; } @@ -384,6 +410,9 @@ enum { OPT_INDEX_OFF, OPT_NFS_EXPORT_ON, OPT_NFS_EXPORT_OFF, + OPT_XINO_ON, + OPT_XINO_OFF, + OPT_XINO_AUTO, OPT_ERR, }; @@ -397,6 +426,9 @@ static const match_table_t ovl_tokens = { {OPT_INDEX_OFF, "index=off"}, {OPT_NFS_EXPORT_ON, "nfs_export=on"}, {OPT_NFS_EXPORT_OFF, "nfs_export=off"}, + {OPT_XINO_ON, "xino=on"}, + {OPT_XINO_OFF, "xino=off"}, + {OPT_XINO_AUTO, "xino=auto"}, {OPT_ERR, NULL} }; @@ -511,6 +543,18 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->nfs_export = false; break; + case OPT_XINO_ON: + config->xino = OVL_XINO_ON; + break; + + case OPT_XINO_OFF: + config->xino = OVL_XINO_OFF; + break; + + case OPT_XINO_AUTO: + config->xino = OVL_XINO_AUTO; + break; + default: pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; @@ -700,6 +744,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs, static int ovl_lower_dir(const char *name, struct path *path, struct ovl_fs *ofs, int *stack_depth, bool *remote) { + int fh_type; int err; err = ovl_mount_dir_noesc(name, path); @@ -719,15 +764,19 @@ static int ovl_lower_dir(const char *name, struct path *path, * The inodes index feature and NFS export need to encode and decode * file handles, so they require that all layers support them. 
*/ + fh_type = ovl_can_decode_fh(path->dentry->d_sb); if ((ofs->config.nfs_export || - (ofs->config.index && ofs->config.upperdir)) && - !ovl_can_decode_fh(path->dentry->d_sb)) { + (ofs->config.index && ofs->config.upperdir)) && !fh_type) { ofs->config.index = false; ofs->config.nfs_export = false; pr_warn("overlayfs: fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", name); } + /* Check if lower fs has 32bit inode numbers */ + if (fh_type != FILEID_INO32_GEN) + ofs->xino_bits = 0; + return 0; out_put: @@ -951,6 +1000,7 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) { struct vfsmount *mnt = ofs->upper_mnt; struct dentry *temp; + int fh_type; int err; err = mnt_want_write(mnt); @@ -1000,12 +1050,16 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) } /* Check if upper/work fs supports file handles */ - if (ofs->config.index && - !ovl_can_decode_fh(ofs->workdir->d_sb)) { + fh_type = ovl_can_decode_fh(ofs->workdir->d_sb); + if (ofs->config.index && !fh_type) { ofs->config.index = false; pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); } + /* Check if upper fs has 32bit inode numbers */ + if (fh_type != FILEID_INO32_GEN) + ofs->xino_bits = 0; + /* NFS export of r/w mount depends on index */ if (ofs->config.nfs_export && !ofs->config.index) { pr_warn("overlayfs: NFS export requires \"index=on\", falling back to nfs_export=off.\n"); @@ -1108,6 +1162,35 @@ out: return err; } +/* Get a unique fsid for the layer */ +static int ovl_get_fsid(struct ovl_fs *ofs, struct super_block *sb) +{ + unsigned int i; + dev_t dev; + int err; + + /* fsid 0 is reserved for upper fs even with non upper overlay */ + if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb) + return 0; + + for (i = 0; i < ofs->numlowerfs; i++) { + if (ofs->lower_fs[i].sb == sb) + return i + 1; + } + + err = get_anon_bdev(&dev); + if (err) { + pr_err("overlayfs: failed to get anonymous 
bdev for lowerpath\n"); + return err; + } + + ofs->lower_fs[ofs->numlowerfs].sb = sb; + ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev; + ofs->numlowerfs++; + + return ofs->numlowerfs; +} + static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, unsigned int numlower) { @@ -1119,23 +1202,27 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, GFP_KERNEL); if (ofs->lower_layers == NULL) goto out; + + ofs->lower_fs = kcalloc(numlower, sizeof(struct ovl_sb), + GFP_KERNEL); + if (ofs->lower_fs == NULL) + goto out; + for (i = 0; i < numlower; i++) { struct vfsmount *mnt; - dev_t dev; + int fsid; - err = get_anon_bdev(&dev); - if (err) { - pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); + err = fsid = ovl_get_fsid(ofs, stack[i].mnt->mnt_sb); + if (err < 0) goto out; - } mnt = clone_private_mount(&stack[i]); err = PTR_ERR(mnt); if (IS_ERR(mnt)) { pr_err("overlayfs: failed to clone lowerpath\n"); - free_anon_bdev(dev); goto out; } + /* * Make lower layers R/O. That way fchmod/fchown on lower file * will fail instead of modifying lower fs. @@ -1143,16 +1230,41 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; ofs->lower_layers[ofs->numlower].mnt = mnt; - ofs->lower_layers[ofs->numlower].pseudo_dev = dev; ofs->lower_layers[ofs->numlower].idx = i + 1; + ofs->lower_layers[ofs->numlower].fsid = fsid; + if (fsid) { + ofs->lower_layers[ofs->numlower].fs = + &ofs->lower_fs[fsid - 1]; + } ofs->numlower++; + } + + /* + * When all layers on same fs, overlay can use real inode numbers. + * With mount option "xino=on", mounter declares that there are enough + * free high bits in underlying fs to hold the unique fsid. + * If overlayfs does encounter underlying inodes using the high xino + * bits reserved for fsid, it emits a warning and uses the original + * inode number. 
+ */ + if (!ofs->numlowerfs || (ofs->numlowerfs == 1 && !ofs->upper_mnt)) { + ofs->xino_bits = 0; + ofs->config.xino = OVL_XINO_OFF; + } else if (ofs->config.xino == OVL_XINO_ON && !ofs->xino_bits) { + /* + * This is a roundup of number of bits needed for numlowerfs+1 + * (i.e. ilog2(numlowerfs+1 - 1) + 1). fsid 0 is reserved for + * upper fs even with non upper overlay. + */ + BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31); + ofs->xino_bits = ilog2(ofs->numlowerfs) + 1; + } - /* Check if all lower layers are on same sb */ - if (i == 0) - ofs->same_sb = mnt->mnt_sb; - else if (ofs->same_sb != mnt->mnt_sb) - ofs->same_sb = NULL; + if (ofs->xino_bits) { + pr_info("overlayfs: \"xino\" feature enabled using %d upper inode bits.\n", + ofs->xino_bits); } + err = 0; out: return err; @@ -1263,6 +1375,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ofs->config.index = ovl_index_def; ofs->config.nfs_export = ovl_nfs_export_def; + ofs->config.xino = ovl_xino_def(); err = ovl_parse_opt((char *) data, &ofs->config); if (err) goto out_err; @@ -1276,6 +1389,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth = 0; sb->s_maxbytes = MAX_LFS_FILESIZE; + /* Assume underlaying fs uses 32bit inodes unless proven otherwise */ + if (ofs->config.xino != OVL_XINO_OFF) + ofs->xino_bits = BITS_PER_LONG - 32; + if (ofs->config.upperdir) { if (!ofs->config.workdir) { pr_err("overlayfs: missing 'workdir'\n"); @@ -1305,8 +1422,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) /* If the upper fs is nonexistent, we mark overlayfs r/o too */ if (!ofs->upper_mnt) sb->s_flags |= SB_RDONLY; - else if (ofs->upper_mnt->mnt_sb != ofs->same_sb) - ofs->same_sb = NULL; if (!(ovl_force_readonly(ofs)) && ofs->config.index) { err = ovl_get_indexdir(ofs, oe, &upperpath); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 930784a26623..6f1078028c66 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c 
@@ -47,13 +47,29 @@ struct super_block *ovl_same_sb(struct super_block *sb) { struct ovl_fs *ofs = sb->s_fs_info; - return ofs->same_sb; + if (!ofs->numlowerfs) + return ofs->upper_mnt->mnt_sb; + else if (ofs->numlowerfs == 1 && !ofs->upper_mnt) + return ofs->lower_fs[0].sb; + else + return NULL; } -bool ovl_can_decode_fh(struct super_block *sb) +/* + * Check if underlying fs supports file handles and try to determine encoding + * type, in order to deduce maximum inode number used by fs. + * + * Return 0 if file handles are not supported. + * Return 1 (FILEID_INO32_GEN) if fs uses the default 32bit inode encoding. + * Return -1 if fs uses a non default encoding with unknown inode size. + */ +int ovl_can_decode_fh(struct super_block *sb) { - return (sb->s_export_op && sb->s_export_op->fh_to_dentry && - !uuid_is_null(&sb->s_uuid)); + if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry || + uuid_is_null(&sb->s_uuid)) + return 0; + + return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN; } struct dentry *ovl_indexdir(struct super_block *sb) @@ -172,6 +188,13 @@ struct dentry *ovl_dentry_lower(struct dentry *dentry) return oe->numlower ? oe->lowerstack[0].dentry : NULL; } +struct ovl_layer *ovl_layer_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->numlower ? 
oe->lowerstack[0].layer : NULL; +} + struct dentry *ovl_dentry_real(struct dentry *dentry) { return ovl_dentry_upper(dentry) ?: ovl_dentry_lower(dentry); @@ -279,12 +302,16 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect) void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, struct dentry *lowerdentry) { + struct inode *realinode = d_inode(upperdentry ?: lowerdentry); + if (upperdentry) OVL_I(inode)->__upperdentry = upperdentry; if (lowerdentry) OVL_I(inode)->lower = igrab(d_inode(lowerdentry)); - ovl_copyattr(d_inode(upperdentry ?: lowerdentry), inode); + ovl_copyattr(realinode, inode); + if (!inode->i_ino) + inode->i_ino = realinode->i_ino; } void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) @@ -299,6 +326,8 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) smp_wmb(); OVL_I(inode)->__upperdentry = upperdentry; if (inode_unhashed(inode)) { + if (!inode->i_ino) + inode->i_ino = upperinode->i_ino; inode->i_private = upperinode; __insert_inode_hash(inode, (unsigned long) upperinode); } diff --git a/fs/proc/array.c b/fs/proc/array.c index 598803576e4c..ae2c807fd719 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -141,25 +141,12 @@ static inline const char *get_task_state(struct task_struct *tsk) return task_state_array[task_state_index(tsk)]; } -static inline int get_task_umask(struct task_struct *tsk) -{ - struct fs_struct *fs; - int umask = -ENOENT; - - task_lock(tsk); - fs = tsk->fs; - if (fs) - umask = fs->umask; - task_unlock(tsk); - return umask; -} - static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; - int g, umask; + int g, umask = -1; struct task_struct *tracer; const struct cred *cred; pid_t ppid, tpid = 0, tgid, ngid; @@ -177,17 +164,18 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, ngid = 
task_numa_group_id(p); cred = get_task_cred(p); - umask = get_task_umask(p); - if (umask >= 0) - seq_printf(m, "Umask:\t%#04o\n", umask); - task_lock(p); + if (p->fs) + umask = p->fs->umask; if (p->files) max_fds = files_fdtable(p->files)->max_fds; task_unlock(p); rcu_read_unlock(); - seq_printf(m, "State:\t%s", get_task_state(p)); + if (umask >= 0) + seq_printf(m, "Umask:\t%#04o\n", umask); + seq_puts(m, "State:\t"); + seq_puts(m, get_task_state(p)); seq_put_decimal_ull(m, "\nTgid:\t", tgid); seq_put_decimal_ull(m, "\nNgid:\t", ngid); @@ -313,8 +301,8 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_puts(m, header); CAP_FOR_EACH_U32(__capi) { - seq_printf(m, "%08x", - a->cap[CAP_LAST_U32 - __capi]); + seq_put_hex_ll(m, NULL, + a->cap[CAP_LAST_U32 - __capi], 8); } seq_putc(m, '\n'); } @@ -368,7 +356,8 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) { - seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state); + seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state); + seq_putc(m, '\n'); } int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, @@ -504,7 +493,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* convert nsec -> ticks */ start_time = nsec_to_clock_t(task->real_start_time); - seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); + seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); + seq_puts(m, " ("); + seq_puts(m, tcomm); + seq_puts(m, ") "); + seq_putc(m, state); seq_put_decimal_ll(m, " ", ppid); seq_put_decimal_ll(m, " ", pgid); seq_put_decimal_ll(m, " ", sid); diff --git a/fs/proc/base.c b/fs/proc/base.c index d53246863cfb..eafa39a3a88c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -388,14 +388,17 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, unsigned long wchan; char symname[KSYM_NAME_LEN]; - wchan = get_wchan(task); + if 
(!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto print0; - if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS) - && !lookup_symbol_name(wchan, symname)) - seq_printf(m, "%s", symname); - else - seq_putc(m, '0'); + wchan = get_wchan(task); + if (wchan && !lookup_symbol_name(wchan, symname)) { + seq_puts(m, symname); + return 0; + } +print0: + seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ @@ -1910,6 +1913,8 @@ static int dname_to_vma_addr(struct dentry *dentry, unsigned long long sval, eval; unsigned int len; + if (str[0] == '0' && str[1] != '-') + return -EINVAL; len = _parse_integer(str, 16, &sval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; @@ -1921,6 +1926,8 @@ static int dname_to_vma_addr(struct dentry *dentry, return -EINVAL; str++; + if (str[0] == '0' && str[1]) + return -EINVAL; len = _parse_integer(str, 16, &eval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; @@ -2204,6 +2211,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) } } up_read(&mm->mmap_sem); + mmput(mm); for (i = 0; i < nr_files; i++) { char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ @@ -2221,7 +2229,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) } if (fa) flex_array_free(fa); - mmput(mm); out_put_task: put_task_struct(task); diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 403cbb12a6e9..8233e7af9389 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -6,7 +6,8 @@ static int cmdline_proc_show(struct seq_file *m, void *v) { - seq_printf(m, "%s\n", saved_command_line); + seq_puts(m, saved_command_line); + seq_putc(m, '\n'); return 0; } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5d709fa8f3a2..2078e70e1595 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -8,12 +8,14 @@ * Copyright (C) 1997 Theodore Ts'o */ +#include <linux/cache.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/mm.h> #include <linux/module.h> 
+#include <linux/namei.h> #include <linux/slab.h> #include <linux/printk.h> #include <linux/mount.h> @@ -28,6 +30,17 @@ static DEFINE_RWLOCK(proc_subdir_lock); +struct kmem_cache *proc_dir_entry_cache __ro_after_init; + +void pde_free(struct proc_dir_entry *pde) +{ + if (S_ISLNK(pde->mode)) + kfree(pde->data); + if (pde->name != pde->inline_name) + kfree(pde->name); + kmem_cache_free(proc_dir_entry_cache, pde); +} + static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len) { if (len < de->namelen) @@ -40,8 +53,8 @@ static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) { - return rb_entry_safe(rb_first_cached(&dir->subdir), - struct proc_dir_entry, subdir_node); + return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, + subdir_node); } static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) @@ -54,7 +67,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, const char *name, unsigned int len) { - struct rb_node *node = dir->subdir.rb_root.rb_node; + struct rb_node *node = dir->subdir.rb_node; while (node) { struct proc_dir_entry *de = rb_entry(node, @@ -75,9 +88,8 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *de) { - struct rb_root_cached *root = &dir->subdir; - struct rb_node **new = &root->rb_root.rb_node, *parent = NULL; - bool leftmost = true; + struct rb_root *root = &dir->subdir; + struct rb_node **new = &root->rb_node, *parent = NULL; /* Figure out where to put new node */ while (*new) { @@ -89,16 +101,15 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, parent = *new; if (result < 0) new = &(*new)->rb_left; - else if (result > 0) { + else if (result > 0) new = &(*new)->rb_right; - leftmost = false; - } else + else return false; } /* Add new node and 
rebalance tree. */ rb_link_node(&de->subdir_node, parent, new); - rb_insert_color_cached(&de->subdir_node, root, leftmost); + rb_insert_color(&de->subdir_node, root); return true; } @@ -207,6 +218,26 @@ void proc_free_inum(unsigned int inum) ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); } +static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + if (flags & LOOKUP_RCU) + return -ECHILD; + + if (atomic_read(&PDE(d_inode(dentry))->in_use) < 0) + return 0; /* revalidate */ + return 1; +} + +static int proc_misc_d_delete(const struct dentry *dentry) +{ + return atomic_read(&PDE(d_inode(dentry))->in_use) < 0; +} + +static const struct dentry_operations proc_misc_dentry_ops = { + .d_revalidate = proc_misc_d_revalidate, + .d_delete = proc_misc_d_delete, +}; + /* * Don't create negative dentries here, return -ENOENT by hand * instead. @@ -224,7 +255,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &simple_dentry_operations); + d_set_d_op(dentry, &proc_misc_dentry_ops); d_add(dentry, inode); return NULL; } @@ -354,6 +385,14 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, WARN(1, "name len %u\n", qstr.len); return NULL; } + if (qstr.len == 1 && fn[0] == '.') { + WARN(1, "name '.'\n"); + return NULL; + } + if (qstr.len == 2 && fn[0] == '.' 
&& fn[1] == '.') { + WARN(1, "name '..'\n"); + return NULL; + } if (*parent == &proc_root && name_to_int(&qstr) != ~0U) { WARN(1, "create '/proc/%s' by hand\n", qstr.name); return NULL; @@ -363,16 +402,26 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, return NULL; } - ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL); + ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!ent) goto out; + if (qstr.len + 1 <= sizeof(ent->inline_name)) { + ent->name = ent->inline_name; + } else { + ent->name = kmalloc(qstr.len + 1, GFP_KERNEL); + if (!ent->name) { + pde_free(ent); + return NULL; + } + } + memcpy(ent->name, fn, qstr.len + 1); ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; - ent->subdir = RB_ROOT_CACHED; - atomic_set(&ent->count, 1); + ent->subdir = RB_ROOT; + refcount_set(&ent->refcnt, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); @@ -395,12 +444,11 @@ struct proc_dir_entry *proc_symlink(const char *name, strcpy((char*)ent->data,dest); ent->proc_iops = &proc_link_inode_operations; if (proc_register(parent, ent) < 0) { - kfree(ent->data); - kfree(ent); + pde_free(ent); ent = NULL; } } else { - kfree(ent); + pde_free(ent); ent = NULL; } } @@ -423,7 +471,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent->proc_iops = &proc_dir_inode_operations; parent->nlink++; if (proc_register(parent, ent) < 0) { - kfree(ent); + pde_free(ent); parent->nlink--; ent = NULL; } @@ -458,7 +506,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->proc_iops = NULL; parent->nlink++; if (proc_register(parent, ent) < 0) { - kfree(ent); + pde_free(ent); parent->nlink--; ent = NULL; } @@ -495,7 +543,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, goto out_free; return pde; out_free: - kfree(pde); + pde_free(pde); out: return NULL; } @@ -522,19 +570,12 @@ 
void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) } EXPORT_SYMBOL(proc_set_user); -static void free_proc_entry(struct proc_dir_entry *de) -{ - proc_free_inum(de->low_ino); - - if (S_ISLNK(de->mode)) - kfree(de->data); - kfree(de); -} - void pde_put(struct proc_dir_entry *pde) { - if (atomic_dec_and_test(&pde->count)) - free_proc_entry(pde); + if (refcount_dec_and_test(&pde->refcnt)) { + proc_free_inum(pde->low_ino); + pde_free(pde); + } } /* @@ -555,7 +596,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) - rb_erase_cached(&de->subdir_node, &parent->subdir); + rb_erase(&de->subdir_node, &parent->subdir); write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); @@ -592,13 +633,13 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) write_unlock(&proc_subdir_lock); return -ENOENT; } - rb_erase_cached(&root->subdir_node, &parent->subdir); + rb_erase(&root->subdir_node, &parent->subdir); de = root; while (1) { next = pde_subdir_first(de); if (next) { - rb_erase_cached(&next->subdir_node, &de->subdir); + rb_erase(&next->subdir_node, &de->subdir); de = next; continue; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6e8724958116..2cf3b74391ca 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -54,6 +54,7 @@ static void proc_evict_inode(struct inode *inode) } static struct kmem_cache *proc_inode_cachep __ro_after_init; +static struct kmem_cache *pde_opener_cache __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { @@ -92,7 +93,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -void __init proc_init_inodecache(void) +void __init proc_init_kmemcache(void) { proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), @@ -100,6 +101,13 @@ void __init proc_init_inodecache(void) SLAB_MEM_SPREAD|SLAB_ACCOUNT| SLAB_PANIC), init_once); + pde_opener_cache = + 
kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, + SLAB_ACCOUNT|SLAB_PANIC, NULL); + proc_dir_entry_cache = kmem_cache_create_usercopy( + "proc_dir_entry", sizeof(struct proc_dir_entry), 0, SLAB_PANIC, + offsetof(struct proc_dir_entry, inline_name), + sizeof_field(struct proc_dir_entry, inline_name), NULL); } static int proc_show_options(struct seq_file *seq, struct dentry *root) @@ -138,7 +146,7 @@ static void unuse_pde(struct proc_dir_entry *pde) complete(pde->pde_unload_completion); } -/* pde is locked */ +/* pde is locked on entry, unlocked on exit */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { /* @@ -157,9 +165,10 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) pdeo->c = &c; spin_unlock(&pde->pde_unload_lock); wait_for_completion(&c); - spin_lock(&pde->pde_unload_lock); } else { struct file *file; + struct completion *c; + pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; @@ -167,9 +176,11 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); /* After ->release. */ list_del(&pdeo->lh); - if (unlikely(pdeo->c)) - complete(pdeo->c); - kfree(pdeo); + c = pdeo->c; + spin_unlock(&pde->pde_unload_lock); + if (unlikely(c)) + complete(c); + kmem_cache_free(pde_opener_cache, pdeo); } } @@ -188,6 +199,7 @@ void proc_entry_rundown(struct proc_dir_entry *de) struct pde_opener *pdeo; pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); close_pdeo(de, pdeo); + spin_lock(&de->pde_unload_lock); } spin_unlock(&de->pde_unload_lock); } @@ -338,31 +350,36 @@ static int proc_reg_open(struct inode *inode, struct file *file) * * Save every "struct file" with custom ->release hook. 
*/ - pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); - if (!pdeo) - return -ENOMEM; - - if (!use_pde(pde)) { - kfree(pdeo); + if (!use_pde(pde)) return -ENOENT; - } - open = pde->proc_fops->open; + release = pde->proc_fops->release; + if (release) { + pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); + if (!pdeo) { + rv = -ENOMEM; + goto out_unuse; + } + } + open = pde->proc_fops->open; if (open) rv = open(inode, file); - if (rv == 0 && release) { - /* To know what to release. */ - pdeo->file = file; - pdeo->closing = false; - pdeo->c = NULL; - spin_lock(&pde->pde_unload_lock); - list_add(&pdeo->lh, &pde->pde_openers); - spin_unlock(&pde->pde_unload_lock); - } else - kfree(pdeo); + if (release) { + if (rv == 0) { + /* To know what to release. */ + pdeo->file = file; + pdeo->closing = false; + pdeo->c = NULL; + spin_lock(&pde->pde_unload_lock); + list_add(&pdeo->lh, &pde->pde_openers); + spin_unlock(&pde->pde_unload_lock); + } else + kmem_cache_free(pde_opener_cache, pdeo); + } +out_unuse: unuse_pde(pde); return rv; } @@ -375,7 +392,7 @@ static int proc_reg_release(struct inode *inode, struct file *file) list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { close_pdeo(pde, pdeo); - break; + return 0; } } spin_unlock(&pde->pde_unload_lock); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d697c8ab0a14..0f1692e63cb6 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> #include <linux/proc_ns.h> +#include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/binfmts.h> @@ -36,7 +37,7 @@ struct proc_dir_entry { * negative -> it's going away RSN */ atomic_t in_use; - atomic_t count; /* use count */ + refcount_t refcnt; struct list_head pde_openers; /* who did ->open, but not ->release */ /* protects ->pde_openers and all struct pde_opener instances */ spinlock_t pde_unload_lock; @@ -50,13 +51,22 @@ struct proc_dir_entry { kgid_t gid; loff_t 
size; struct proc_dir_entry *parent; - struct rb_root_cached subdir; + struct rb_root subdir; struct rb_node subdir_node; + char *name; umode_t mode; u8 namelen; - char name[]; +#ifdef CONFIG_64BIT +#define SIZEOF_PDE_INLINE_NAME (192-139) +#else +#define SIZEOF_PDE_INLINE_NAME (128-87) +#endif + char inline_name[SIZEOF_PDE_INLINE_NAME]; } __randomize_layout; +extern struct kmem_cache *proc_dir_entry_cache; +void pde_free(struct proc_dir_entry *pde); + union proc_op { int (*proc_get_link)(struct dentry *, struct path *); int (*proc_show)(struct seq_file *m, @@ -159,7 +169,7 @@ int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry * static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { - atomic_inc(&pde->count); + refcount_inc(&pde->refcnt); return pde; } extern void pde_put(struct proc_dir_entry *); @@ -177,12 +187,12 @@ struct pde_opener { struct list_head lh; bool closing; struct completion *c; -}; +} __randomize_layout; extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; -extern void proc_init_inodecache(void); +void proc_init_kmemcache(void); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern int proc_fill_super(struct super_block *, void *data, int flags); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6bb20f864259..65a72ab57471 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -26,20 +26,7 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) { - char v[32]; - static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '}; - int len; - - len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10)); - - seq_write(m, s, 16); - - if (len > 0) { - if (len < 8) - seq_write(m, blanks, 8 - len); - - seq_write(m, v, len); - } + seq_put_decimal_ull_width(m, s, num 
<< (PAGE_SHIFT - 10), 8); seq_write(m, " kB\n", 4); } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 68c06ae7888c..1763f370489d 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -192,15 +192,16 @@ static __net_init int proc_net_ns_init(struct net *net) int err; err = -ENOMEM; - netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL); + netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) goto out; - netd->subdir = RB_ROOT_CACHED; + netd->subdir = RB_ROOT; netd->data = net; netd->nlink = 2; netd->namelen = 3; netd->parent = &proc_root; + netd->name = netd->inline_name; memcpy(netd->name, "net", 4); uid = make_kuid(net->user_ns, 0); @@ -223,7 +224,7 @@ static __net_init int proc_net_ns_init(struct net *net) return 0; free_net: - kfree(netd); + pde_free(netd); out: return err; } @@ -231,7 +232,7 @@ out: static __net_exit void proc_net_ns_exit(struct net *net) { remove_proc_entry("stat", net->proc_net); - kfree(net->proc_net); + pde_free(net->proc_net); } static struct pernet_operations __net_initdata proc_net_ns_ops = { diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c41ab261397d..8989936f2995 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -707,14 +707,14 @@ static bool proc_sys_link_fill_cache(struct file *file, struct ctl_table *table) { bool ret = true; + head = sysctl_head_grab(head); + if (IS_ERR(head)) + return false; - if (S_ISLNK(table->mode)) { - /* It is not an error if we can not follow the link ignore it */ - int err = sysctl_follow_link(&head, &table); - if (err) - goto out; - } + /* It is not an error if we can not follow the link ignore it */ + if (sysctl_follow_link(&head, &table)) + goto out; ret = proc_sys_fill_cache(file, ctx, head, table); out: @@ -1086,7 +1086,7 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) if ((table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) - 
err |= sysctl_err(path, table, "array now allowed"); + err |= sysctl_err(path, table, "array not allowed"); } return err; diff --git a/fs/proc/root.c b/fs/proc/root.c index ede8e64974be..61b7340b357a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -123,23 +123,13 @@ static struct file_system_type proc_fs_type = { void __init proc_root_init(void) { - int err; - - proc_init_inodecache(); + proc_init_kmemcache(); set_proc_pid_nlink(); - err = register_filesystem(&proc_fs_type); - if (err) - return; - proc_self_init(); proc_thread_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); - -#ifdef CONFIG_SYSVIPC - proc_mkdir("sysvipc", NULL); -#endif proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ @@ -150,6 +140,8 @@ void __init proc_root_init(void) proc_tty_init(); proc_mkdir("bus", NULL); proc_sys_init(); + + register_filesystem(&proc_fs_type); } static int proc_root_getattr(const struct path *path, struct kstat *stat, @@ -207,12 +199,13 @@ struct proc_dir_entry proc_root = { .namelen = 5, .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, - .count = ATOMIC_INIT(1), + .refcnt = REFCOUNT_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, - .subdir = RB_ROOT_CACHED, - .name = "/proc", + .subdir = RB_ROOT, + .name = proc_root.inline_name, + .inline_name = "/proc", }; int pid_ns_prepare_proc(struct pid_namespace *ns) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ec6d2983a5cb..65ae54659833 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -24,6 +24,8 @@ #include <asm/tlbflush.h> #include "internal.h" +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) void task_mem(struct seq_file *m, struct mm_struct *mm) { unsigned long text, lib, swap, anon, file, shmem; @@ -53,39 +55,28 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) lib 
= (mm->exec_vm << PAGE_SHIFT) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - seq_printf(m, - "VmPeak:\t%8lu kB\n" - "VmSize:\t%8lu kB\n" - "VmLck:\t%8lu kB\n" - "VmPin:\t%8lu kB\n" - "VmHWM:\t%8lu kB\n" - "VmRSS:\t%8lu kB\n" - "RssAnon:\t%8lu kB\n" - "RssFile:\t%8lu kB\n" - "RssShmem:\t%8lu kB\n" - "VmData:\t%8lu kB\n" - "VmStk:\t%8lu kB\n" - "VmExe:\t%8lu kB\n" - "VmLib:\t%8lu kB\n" - "VmPTE:\t%8lu kB\n" - "VmSwap:\t%8lu kB\n", - hiwater_vm << (PAGE_SHIFT-10), - total_vm << (PAGE_SHIFT-10), - mm->locked_vm << (PAGE_SHIFT-10), - mm->pinned_vm << (PAGE_SHIFT-10), - hiwater_rss << (PAGE_SHIFT-10), - total_rss << (PAGE_SHIFT-10), - anon << (PAGE_SHIFT-10), - file << (PAGE_SHIFT-10), - shmem << (PAGE_SHIFT-10), - mm->data_vm << (PAGE_SHIFT-10), - mm->stack_vm << (PAGE_SHIFT-10), - text >> 10, - lib >> 10, - mm_pgtables_bytes(mm) >> 10, - swap << (PAGE_SHIFT-10)); + SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); + SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); + SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); + SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm); + SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); + SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); + SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); + SEQ_PUT_DEC(" kB\nRssFile:\t", file); + SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem); + SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm); + SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm); + seq_put_decimal_ull_width(m, + " kB\nVmExe:\t", text >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmLib:\t", lib >> 10, 8); + seq_put_decimal_ull_width(m, + " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); + SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); + seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); } +#undef SEQ_PUT_DEC unsigned long task_vsize(struct mm_struct *mm) { @@ -287,15 +278,18 @@ static void show_vma_header_prefix(struct seq_file *m, dev_t dev, unsigned long ino) { seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", - start, - end, - flags & VM_READ ? 
'r' : '-', - flags & VM_WRITE ? 'w' : '-', - flags & VM_EXEC ? 'x' : '-', - flags & VM_MAYSHARE ? 's' : 'p', - pgoff, - MAJOR(dev), MINOR(dev), ino); + seq_put_hex_ll(m, NULL, start, 8); + seq_put_hex_ll(m, "-", end, 8); + seq_putc(m, ' '); + seq_putc(m, flags & VM_READ ? 'r' : '-'); + seq_putc(m, flags & VM_WRITE ? 'w' : '-'); + seq_putc(m, flags & VM_EXEC ? 'x' : '-'); + seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p'); + seq_put_hex_ll(m, " ", pgoff, 8); + seq_put_hex_ll(m, " ", MAJOR(dev), 2); + seq_put_hex_ll(m, ":", MINOR(dev), 2); + seq_put_decimal_ull(m, " ", ino); + seq_putc(m, ' '); } static void @@ -694,8 +688,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) if (!mnemonics[i][0]) continue; if (vma->vm_flags & (1UL << i)) { - seq_printf(m, "%c%c ", - mnemonics[i][0], mnemonics[i][1]); + seq_putc(m, mnemonics[i][0]); + seq_putc(m, mnemonics[i][1]); + seq_putc(m, ' '); } } seq_putc(m, '\n'); @@ -736,6 +731,8 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) { } +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) >> 10, 8) static int show_smap(struct seq_file *m, void *v, int is_pid) { struct proc_maps_private *priv = m->private; @@ -809,51 +806,34 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) ret = SEQ_SKIP; } - if (!rollup_mode) - seq_printf(m, - "Size: %8lu kB\n" - "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n", - (vma->vm_end - vma->vm_start) >> 10, - vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10); - - - if (!rollup_mode || last_vma) - seq_printf(m, - "Rss: %8lu kB\n" - "Pss: %8lu kB\n" - "Shared_Clean: %8lu kB\n" - "Shared_Dirty: %8lu kB\n" - "Private_Clean: %8lu kB\n" - "Private_Dirty: %8lu kB\n" - "Referenced: %8lu kB\n" - "Anonymous: %8lu kB\n" - "LazyFree: %8lu kB\n" - "AnonHugePages: %8lu kB\n" - "ShmemPmdMapped: %8lu kB\n" - "Shared_Hugetlb: %8lu kB\n" - "Private_Hugetlb: %7lu kB\n" - "Swap: %8lu kB\n" - "SwapPss: %8lu 
kB\n" - "Locked: %8lu kB\n", - mss->resident >> 10, - (unsigned long)(mss->pss >> (10 + PSS_SHIFT)), - mss->shared_clean >> 10, - mss->shared_dirty >> 10, - mss->private_clean >> 10, - mss->private_dirty >> 10, - mss->referenced >> 10, - mss->anonymous >> 10, - mss->lazyfree >> 10, - mss->anonymous_thp >> 10, - mss->shmem_thp >> 10, - mss->shared_hugetlb >> 10, - mss->private_hugetlb >> 10, - mss->swap >> 10, - (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)), - (unsigned long)(mss->pss >> (10 + PSS_SHIFT))); + if (!rollup_mode) { + SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); + SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); + SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); + seq_puts(m, " kB\n"); + } + if (!rollup_mode || last_vma) { + SEQ_PUT_DEC("Rss: ", mss->resident); + SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); + SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); + SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); + SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); + SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); + SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); + SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); + SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); + SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); + SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); + seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", + mss->private_hugetlb >> 10, 7); + SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); + SEQ_PUT_DEC(" kB\nSwapPss: ", + mss->swap_pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nLocked: ", mss->pss >> PSS_SHIFT); + seq_puts(m, " kB\n"); + } if (!rollup_mode) { arch_show_smap(m, vma); show_smap_vma_flags(m, vma); @@ -861,6 +841,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) m_cache_vma(m, vma); return ret; } +#undef SEQ_PUT_DEC static int show_pid_smap(struct seq_file *m, void *v) { diff --git 
a/fs/pstore/Kconfig b/fs/pstore/Kconfig index b42e5bd6d8ff..09c19ef91526 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -1,5 +1,6 @@ config PSTORE tristate "Persistent store support" + select CRYPTO if PSTORE_COMPRESS default n help This option enables generic access to platform level @@ -12,35 +13,89 @@ config PSTORE If you don't have a platform persistent store driver, say N. -choice - prompt "Choose compression algorithm" - depends on PSTORE - default PSTORE_ZLIB_COMPRESS - help - This option chooses compression algorithm. - -config PSTORE_ZLIB_COMPRESS - bool "ZLIB" - select ZLIB_DEFLATE - select ZLIB_INFLATE - help - This option enables ZLIB compression algorithm support. +config PSTORE_DEFLATE_COMPRESS + tristate "DEFLATE (ZLIB) compression" + default y + depends on PSTORE + select CRYPTO_DEFLATE + help + This option enables DEFLATE (also known as ZLIB) compression + algorithm support. config PSTORE_LZO_COMPRESS - bool "LZO" - select LZO_COMPRESS - select LZO_DECOMPRESS - help - This option enables LZO compression algorithm support. + tristate "LZO compression" + depends on PSTORE + select CRYPTO_LZO + help + This option enables LZO compression algorithm support. config PSTORE_LZ4_COMPRESS - bool "LZ4" - select LZ4_COMPRESS - select LZ4_DECOMPRESS - help - This option enables LZ4 compression algorithm support. + tristate "LZ4 compression" + depends on PSTORE + select CRYPTO_LZ4 + help + This option enables LZ4 compression algorithm support. + +config PSTORE_LZ4HC_COMPRESS + tristate "LZ4HC compression" + depends on PSTORE + select CRYPTO_LZ4HC + help + This option enables LZ4HC (high compression) mode algorithm. + +config PSTORE_842_COMPRESS + bool "842 compression" + depends on PSTORE + select CRYPTO_842 + help + This option enables 842 compression algorithm support. 
+ +config PSTORE_COMPRESS + def_bool y + depends on PSTORE + depends on PSTORE_DEFLATE_COMPRESS || PSTORE_LZO_COMPRESS || \ + PSTORE_LZ4_COMPRESS || PSTORE_LZ4HC_COMPRESS || \ + PSTORE_842_COMPRESS + +choice + prompt "Default pstore compression algorithm" + depends on PSTORE_COMPRESS + help + This option chooses the default active compression algorithm. + This can be changed at boot with "pstore.compress=..." on + the kernel command line. + + Currently, pstore has support for 5 compression algorithms: + deflate, lzo, lz4, lz4hc and 842. + + The default compression algorithm is deflate. + + config PSTORE_DEFLATE_COMPRESS_DEFAULT + bool "deflate" if PSTORE_DEFLATE_COMPRESS + + config PSTORE_LZO_COMPRESS_DEFAULT + bool "lzo" if PSTORE_LZO_COMPRESS + + config PSTORE_LZ4_COMPRESS_DEFAULT + bool "lz4" if PSTORE_LZ4_COMPRESS + + config PSTORE_LZ4HC_COMPRESS_DEFAULT + bool "lz4hc" if PSTORE_LZ4HC_COMPRESS + + config PSTORE_842_COMPRESS_DEFAULT + bool "842" if PSTORE_842_COMPRESS + endchoice +config PSTORE_COMPRESS_DEFAULT + string + depends on PSTORE_COMPRESS + default "deflate" if PSTORE_DEFLATE_COMPRESS_DEFAULT + default "lzo" if PSTORE_LZO_COMPRESS_DEFAULT + default "lz4" if PSTORE_LZ4_COMPRESS_DEFAULT + default "lz4hc" if PSTORE_LZ4HC_COMPRESS_DEFAULT + default "842" if PSTORE_842_COMPRESS_DEFAULT + config PSTORE_CONSOLE bool "Log kernel console messages" depends on PSTORE diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d814723fb27d..5fcb845b9fec 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -486,6 +486,8 @@ static int __init init_pstore_fs(void) { int err; + pstore_choose_compression(); + /* Create a convenient mount point for people to access pstore */ err = sysfs_create_mount_point(fs_kobj, "pstore"); if (err) diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index c029314478fa..fb767e28aeb2 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -37,4 +37,7 @@ extern bool pstore_is_mounted(void); extern void
pstore_record_init(struct pstore_record *record, struct pstore_info *psi); +/* Called during module_init() */ +extern void __init pstore_choose_compression(void); + #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index c3129b131e4d..dc720573fd53 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -28,15 +28,13 @@ #include <linux/console.h> #include <linux/module.h> #include <linux/pstore.h> -#ifdef CONFIG_PSTORE_ZLIB_COMPRESS -#include <linux/zlib.h> -#endif -#ifdef CONFIG_PSTORE_LZO_COMPRESS +#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS) #include <linux/lzo.h> #endif -#ifdef CONFIG_PSTORE_LZ4_COMPRESS +#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS) #include <linux/lz4.h> #endif +#include <linux/crypto.h> #include <linux/string.h> #include <linux/timer.h> #include <linux/slab.h> @@ -74,23 +72,18 @@ static DEFINE_SPINLOCK(pstore_lock); struct pstore_info *psinfo; static char *backend; - -/* Compression parameters */ -#ifdef CONFIG_PSTORE_ZLIB_COMPRESS -#define COMPR_LEVEL 6 -#define WINDOW_BITS 12 -#define MEM_LEVEL 4 -static struct z_stream_s stream; +static char *compress = +#ifdef CONFIG_PSTORE_COMPRESS_DEFAULT + CONFIG_PSTORE_COMPRESS_DEFAULT; #else -static unsigned char *workspace; + NULL; #endif -struct pstore_zbackend { - int (*compress)(const void *in, void *out, size_t inlen, size_t outlen); - int (*decompress)(void *in, void *out, size_t inlen, size_t outlen); - void (*allocate)(void); - void (*free)(void); +/* Compression parameters */ +static struct crypto_comp *tfm; +struct pstore_zbackend { + int (*zbufsize)(size_t size); const char *name; }; @@ -149,77 +142,12 @@ bool pstore_cannot_block_path(enum kmsg_dump_reason reason) } EXPORT_SYMBOL_GPL(pstore_cannot_block_path); -#ifdef CONFIG_PSTORE_ZLIB_COMPRESS -/* Derived from logfs_compress() */ -static int compress_zlib(const void *in, void *out, size_t inlen, size_t outlen) -{ - int err, ret; - - ret = -EIO; - err = 
zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS, - MEM_LEVEL, Z_DEFAULT_STRATEGY); - if (err != Z_OK) - goto error; - - stream.next_in = in; - stream.avail_in = inlen; - stream.total_in = 0; - stream.next_out = out; - stream.avail_out = outlen; - stream.total_out = 0; - - err = zlib_deflate(&stream, Z_FINISH); - if (err != Z_STREAM_END) - goto error; - - err = zlib_deflateEnd(&stream); - if (err != Z_OK) - goto error; - - if (stream.total_out >= stream.total_in) - goto error; - - ret = stream.total_out; -error: - return ret; -} - -/* Derived from logfs_uncompress */ -static int decompress_zlib(void *in, void *out, size_t inlen, size_t outlen) +#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS) +static int zbufsize_deflate(size_t size) { - int err, ret; - - ret = -EIO; - err = zlib_inflateInit2(&stream, WINDOW_BITS); - if (err != Z_OK) - goto error; - - stream.next_in = in; - stream.avail_in = inlen; - stream.total_in = 0; - stream.next_out = out; - stream.avail_out = outlen; - stream.total_out = 0; - - err = zlib_inflate(&stream, Z_FINISH); - if (err != Z_STREAM_END) - goto error; - - err = zlib_inflateEnd(&stream); - if (err != Z_OK) - goto error; - - ret = stream.total_out; -error: - return ret; -} - -static void allocate_zlib(void) -{ - size_t size; size_t cmpr; - switch (psinfo->bufsize) { + switch (size) { /* buffer range for efivars */ case 1000 ... 
2000: cmpr = 56; @@ -239,212 +167,131 @@ static void allocate_zlib(void) break; } - big_oops_buf_sz = (psinfo->bufsize * 100) / cmpr; - big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); - if (big_oops_buf) { - size = max(zlib_deflate_workspacesize(WINDOW_BITS, MEM_LEVEL), - zlib_inflate_workspacesize()); - stream.workspace = kmalloc(size, GFP_KERNEL); - if (!stream.workspace) { - pr_err("No memory for compression workspace; skipping compression\n"); - kfree(big_oops_buf); - big_oops_buf = NULL; - } - } else { - pr_err("No memory for uncompressed data; skipping compression\n"); - stream.workspace = NULL; - } - + return (size * 100) / cmpr; } - -static void free_zlib(void) -{ - kfree(stream.workspace); - stream.workspace = NULL; - kfree(big_oops_buf); - big_oops_buf = NULL; - big_oops_buf_sz = 0; -} - -static const struct pstore_zbackend backend_zlib = { - .compress = compress_zlib, - .decompress = decompress_zlib, - .allocate = allocate_zlib, - .free = free_zlib, - .name = "zlib", -}; #endif -#ifdef CONFIG_PSTORE_LZO_COMPRESS -static int compress_lzo(const void *in, void *out, size_t inlen, size_t outlen) +#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS) +static int zbufsize_lzo(size_t size) { - int ret; - - ret = lzo1x_1_compress(in, inlen, out, &outlen, workspace); - if (ret != LZO_E_OK) { - pr_err("lzo_compress error, ret = %d!\n", ret); - return -EIO; - } - - return outlen; + return lzo1x_worst_compress(size); } +#endif -static int decompress_lzo(void *in, void *out, size_t inlen, size_t outlen) +#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS) +static int zbufsize_lz4(size_t size) { - int ret; - - ret = lzo1x_decompress_safe(in, inlen, out, &outlen); - if (ret != LZO_E_OK) { - pr_err("lzo_decompress error, ret = %d!\n", ret); - return -EIO; - } - - return outlen; + return LZ4_compressBound(size); } +#endif -static void allocate_lzo(void) +#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS) +static int zbufsize_842(size_t size) { - 
big_oops_buf_sz = lzo1x_worst_compress(psinfo->bufsize); - big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); - if (big_oops_buf) { - workspace = kmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - if (!workspace) { - pr_err("No memory for compression workspace; skipping compression\n"); - kfree(big_oops_buf); - big_oops_buf = NULL; - } - } else { - pr_err("No memory for uncompressed data; skipping compression\n"); - workspace = NULL; - } + return size; } +#endif -static void free_lzo(void) -{ - kfree(workspace); - kfree(big_oops_buf); - big_oops_buf = NULL; - big_oops_buf_sz = 0; -} +static const struct pstore_zbackend *zbackend __ro_after_init; -static const struct pstore_zbackend backend_lzo = { - .compress = compress_lzo, - .decompress = decompress_lzo, - .allocate = allocate_lzo, - .free = free_lzo, - .name = "lzo", -}; +static const struct pstore_zbackend zbackends[] = { +#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS) + { + .zbufsize = zbufsize_deflate, + .name = "deflate", + }, +#endif +#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS) + { + .zbufsize = zbufsize_lzo, + .name = "lzo", + }, +#endif +#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) + { + .zbufsize = zbufsize_lz4, + .name = "lz4", + }, #endif +#if IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS) + { + .zbufsize = zbufsize_lz4, + .name = "lz4hc", + }, +#endif +#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS) + { + .zbufsize = zbufsize_842, + .name = "842", + }, +#endif + { } +}; -#ifdef CONFIG_PSTORE_LZ4_COMPRESS -static int compress_lz4(const void *in, void *out, size_t inlen, size_t outlen) +static int pstore_compress(const void *in, void *out, + unsigned int inlen, unsigned int outlen) { int ret; - ret = LZ4_compress_default(in, out, inlen, outlen, workspace); - if (!ret) { - pr_err("LZ4_compress_default error; compression failed!\n"); - return -EIO; + ret = crypto_comp_compress(tfm, in, inlen, out, &outlen); + if (ret) { + pr_err("crypto_comp_compress failed, ret = %d!\n", ret); + return ret; } - return ret; + return outlen; 
} -static int decompress_lz4(void *in, void *out, size_t inlen, size_t outlen) +static int pstore_decompress(void *in, void *out, + unsigned int inlen, unsigned int outlen) { int ret; - ret = LZ4_decompress_safe(in, out, inlen, outlen); - if (ret < 0) { - /* - * LZ4_decompress_safe will return an error code - * (< 0) if decompression failed - */ - pr_err("LZ4_decompress_safe error, ret = %d!\n", ret); - return -EIO; + ret = crypto_comp_decompress(tfm, in, inlen, out, &outlen); + if (ret) { + pr_err("crypto_comp_decompress failed, ret = %d!\n", ret); + return ret; } - return ret; -} - -static void allocate_lz4(void) -{ - big_oops_buf_sz = LZ4_compressBound(psinfo->bufsize); - big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); - if (big_oops_buf) { - workspace = kmalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); - if (!workspace) { - pr_err("No memory for compression workspace; skipping compression\n"); - kfree(big_oops_buf); - big_oops_buf = NULL; - } - } else { - pr_err("No memory for uncompressed data; skipping compression\n"); - workspace = NULL; - } + return outlen; } -static void free_lz4(void) +static void allocate_buf_for_compression(void) { - kfree(workspace); - kfree(big_oops_buf); - big_oops_buf = NULL; - big_oops_buf_sz = 0; -} - -static const struct pstore_zbackend backend_lz4 = { - .compress = compress_lz4, - .decompress = decompress_lz4, - .allocate = allocate_lz4, - .free = free_lz4, - .name = "lz4", -}; -#endif - -static const struct pstore_zbackend *zbackend = -#if defined(CONFIG_PSTORE_ZLIB_COMPRESS) - &backend_zlib; -#elif defined(CONFIG_PSTORE_LZO_COMPRESS) - &backend_lzo; -#elif defined(CONFIG_PSTORE_LZ4_COMPRESS) - &backend_lz4; -#else - NULL; -#endif + if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend) + return; -static int pstore_compress(const void *in, void *out, - size_t inlen, size_t outlen) -{ - if (zbackend) - return zbackend->compress(in, out, inlen, outlen); - else - return -EIO; -} + if (!crypto_has_comp(zbackend->name, 0, 0)) { + 
pr_err("No %s compression\n", zbackend->name); + return; + } -static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen) -{ - if (zbackend) - return zbackend->decompress(in, out, inlen, outlen); - else - return -EIO; -} + big_oops_buf_sz = zbackend->zbufsize(psinfo->bufsize); + if (big_oops_buf_sz <= 0) + return; -static void allocate_buf_for_compression(void) -{ - if (zbackend) { - pr_info("using %s compression\n", zbackend->name); - zbackend->allocate(); - } else { + big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); + if (!big_oops_buf) { pr_err("allocate compression buffer error!\n"); + return; + } + + tfm = crypto_alloc_comp(zbackend->name, 0, 0); + if (IS_ERR_OR_NULL(tfm)) { + kfree(big_oops_buf); + big_oops_buf = NULL; + pr_err("crypto_alloc_comp() failed!\n"); + return; } } static void free_buf_for_compression(void) { - if (zbackend) - zbackend->free(); - else - pr_err("free compression buffer error!\n"); + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && !IS_ERR_OR_NULL(tfm)) + crypto_free_comp(tfm); + kfree(big_oops_buf); + big_oops_buf = NULL; + big_oops_buf_sz = 0; } /* @@ -901,5 +748,24 @@ static void pstore_timefunc(struct timer_list *unused) jiffies + msecs_to_jiffies(pstore_update_ms)); } +void __init pstore_choose_compression(void) +{ + const struct pstore_zbackend *step; + + if (!compress) + return; + + for (step = zbackends; step->name; step++) { + if (!strcmp(compress, step->name)) { + zbackend = step; + pr_info("using %s compression\n", zbackend->name); + return; + } + } +} + +module_param(compress, charp, 0444); +MODULE_PARM_DESC(compress, "Pstore compression to use"); + module_param(backend, charp, 0444); MODULE_PARM_DESC(backend, "Pstore backend to use"); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 7125b398d312..49b2bc114868 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -938,7 +938,7 @@ static int __init ramoops_init(void) ramoops_register_dummy(); return platform_driver_register(&ramoops_driver); } 
-postcore_initcall(ramoops_init); +late_initcall(ramoops_init); static void __exit ramoops_exit(void) { diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index e11672aa4575..951a14edcf51 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -98,24 +98,23 @@ static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, uint8_t *data, size_t len, uint8_t *ecc) { int i; - uint16_t par[prz->ecc_info.ecc_size]; /* Initialize the parity buffer */ - memset(par, 0, sizeof(par)); - encode_rs8(prz->rs_decoder, data, len, par, 0); + memset(prz->ecc_info.par, 0, + prz->ecc_info.ecc_size * sizeof(prz->ecc_info.par[0])); + encode_rs8(prz->rs_decoder, data, len, prz->ecc_info.par, 0); for (i = 0; i < prz->ecc_info.ecc_size; i++) - ecc[i] = par[i]; + ecc[i] = prz->ecc_info.par[i]; } static int persistent_ram_decode_rs8(struct persistent_ram_zone *prz, void *data, size_t len, uint8_t *ecc) { int i; - uint16_t par[prz->ecc_info.ecc_size]; for (i = 0; i < prz->ecc_info.ecc_size; i++) - par[i] = ecc[i]; - return decode_rs8(prz->rs_decoder, data, par, len, + prz->ecc_info.par[i] = ecc[i]; + return decode_rs8(prz->rs_decoder, data, prz->ecc_info.par, len, NULL, 0, NULL, 0, NULL); } @@ -228,6 +227,15 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, return -EINVAL; } + /* allocate workspace instead of using stack VLA */ + prz->ecc_info.par = kmalloc_array(prz->ecc_info.ecc_size, + sizeof(*prz->ecc_info.par), + GFP_KERNEL); + if (!prz->ecc_info.par) { + pr_err("cannot allocate ECC parity workspace\n"); + return -ENOMEM; + } + prz->corrected_bytes = 0; prz->bad_blocks = 0; @@ -514,6 +522,13 @@ void persistent_ram_free(struct persistent_ram_zone *prz) } prz->vaddr = NULL; } + if (prz->rs_decoder) { + free_rs(prz->rs_decoder); + prz->rs_decoder = NULL; + } + kfree(prz->ecc_info.par); + prz->ecc_info.par = NULL; + persistent_ram_free_old(prz); kfree(prz); } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 
70057359fbaf..23148c3ed675 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2643,7 +2643,7 @@ static int journal_init_dev(struct super_block *super, if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; - reiserfs_warning(super, + reiserfs_warning(super, "sh-457", "journal_init_dev: Cannot open '%s': %i", jdev_name, result); return result; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 48835a659948..ae4811fecc1f 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -1916,7 +1916,7 @@ struct reiserfs_de_head { /* empty directory contains two entries "." and ".." and their headers */ #define EMPTY_DIR_SIZE \ -(DEH_SIZE * 2 + ROUND_UP (strlen (".")) + ROUND_UP (strlen (".."))) +(DEH_SIZE * 2 + ROUND_UP (sizeof(".") - 1) + ROUND_UP (sizeof("..") - 1)) /* old format directories have this size when empty */ #define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3) diff --git a/fs/seq_file.c b/fs/seq_file.c index eea09f6d8830..c6c27f1f9c98 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -6,6 +6,7 @@ * initial implementation -- AV, Oct 2001. 
*/ +#include <linux/cache.h> #include <linux/fs.h> #include <linux/export.h> #include <linux/seq_file.h> @@ -19,6 +20,8 @@ #include <linux/uaccess.h> #include <asm/page.h> +static struct kmem_cache *seq_file_cache __ro_after_init; + static void seq_set_overflow(struct seq_file *m) { m->count = m->size; @@ -26,7 +29,7 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { - return kvmalloc(size, GFP_KERNEL); + return kvmalloc(size, GFP_KERNEL_ACCOUNT); } /** @@ -51,7 +54,7 @@ int seq_open(struct file *file, const struct seq_operations *op) WARN_ON(file->private_data); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL); if (!p) return -ENOMEM; @@ -366,7 +369,7 @@ int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; kvfree(m->buf); - kfree(m); + kmem_cache_free(seq_file_cache, m); return 0; } EXPORT_SYMBOL(seq_release); @@ -563,7 +566,7 @@ static void single_stop(struct seq_file *p, void *v) int single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data) { - struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); + struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT); int res = -ENOMEM; if (op) { @@ -625,7 +628,7 @@ void *__seq_open_private(struct file *f, const struct seq_operations *ops, void *private; struct seq_file *seq; - private = kzalloc(psize, GFP_KERNEL); + private = kzalloc(psize, GFP_KERNEL_ACCOUNT); if (private == NULL) goto out; @@ -673,29 +676,37 @@ void seq_puts(struct seq_file *m, const char *s) } EXPORT_SYMBOL(seq_puts); -/* +/** * A helper routine for putting decimal numbers without rich format of printf(). * only 'unsigned long long' is supported. - * This routine will put strlen(delimiter) + number into seq_file. 
+ * @m: seq_file identifying the buffer to which data should be written + * @delimiter: a string which is printed before the number + * @num: the number + * @width: a minimum field width + * + * This routine will put strlen(delimiter) + number into seq_filed. * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ -void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, - unsigned long long num) +void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, + unsigned long long num, unsigned int width) { int len; if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ goto overflow; - len = strlen(delimiter); - if (m->count + len >= m->size) - goto overflow; + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } - memcpy(m->buf + m->count, delimiter, len); - m->count += len; + if (!width) + width = 1; - if (m->count + 1 >= m->size) + if (m->count + width >= m->size) goto overflow; if (num < 10) { @@ -703,7 +714,7 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, return; } - len = num_to_str(m->buf + m->count, m->size - m->count, num); + len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; @@ -713,8 +724,60 @@ void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, overflow: seq_set_overflow(m); } + +void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, + unsigned long long num) +{ + return seq_put_decimal_ull_width(m, delimiter, num, 0); +} EXPORT_SYMBOL(seq_put_decimal_ull); +/** + * seq_put_hex_ll - put a number in hexadecimal notation + * @m: seq_file identifying the buffer to which data should be written + * @delimiter: a string which is printed before the number + * @v: the number + * @width: a minimum field width + * + * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v) + 
* + * This routine is very quick when you show lots of numbers. + * In usual cases, it will be better to use seq_printf(). It's easier to read. + */ +void seq_put_hex_ll(struct seq_file *m, const char *delimiter, + unsigned long long v, unsigned int width) +{ + unsigned int len; + int i; + + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } + + /* If x is 0, the result of __builtin_clzll is undefined */ + if (v == 0) + len = 1; + else + len = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4; + + if (len < width) + len = width; + + if (m->count + len > m->size) { + seq_set_overflow(m); + return; + } + + for (i = len - 1; i >= 0; i--) { + m->buf[m->count + i] = hex_asc[0xf & v]; + v = v >> 4; + } + m->count += len; +} + void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num) { int len; @@ -722,12 +785,12 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */ goto overflow; - len = strlen(delimiter); - if (m->count + len >= m->size) - goto overflow; - - memcpy(m->buf + m->count, delimiter, len); - m->count += len; + if (delimiter && delimiter[0]) { + if (delimiter[1] == 0) + seq_putc(m, delimiter[0]); + else + seq_puts(m, delimiter); + } if (m->count + 2 >= m->size) goto overflow; @@ -742,7 +805,7 @@ void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num return; } - len = num_to_str(m->buf + m->count, m->size - m->count, num); + len = num_to_str(m->buf + m->count, m->size - m->count, num, 0); if (!len) goto overflow; @@ -782,8 +845,14 @@ EXPORT_SYMBOL(seq_write); void seq_pad(struct seq_file *m, char c) { int size = m->pad_until - m->count; - if (size > 0) - seq_printf(m, "%*s", size, ""); + if (size > 0) { + if (size + m->count > m->size) { + seq_set_overflow(m); + return; + } + memset(m->buf + m->count, ' ', size); + m->count += size; + } if (c) 
seq_putc(m, c); } @@ -1040,3 +1109,8 @@ seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, return NULL; } EXPORT_SYMBOL(seq_hlist_next_percpu); + +void __init seq_file_init(void) +{ + seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT|SLAB_PANIC); +} diff --git a/fs/super.c b/fs/super.c index 672538ca9831..5fa9a8d8d865 100644 --- a/fs/super.c +++ b/fs/super.c @@ -37,6 +37,7 @@ #include <linux/user_namespace.h> #include "internal.h" +static int thaw_super_locked(struct super_block *sb); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); @@ -574,6 +575,28 @@ void drop_super_exclusive(struct super_block *sb) } EXPORT_SYMBOL(drop_super_exclusive); +static void __iterate_supers(void (*f)(struct super_block *)) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (hlist_unhashed(&sb->s_instances)) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + + f(sb); + + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); +} /** * iterate_supers - call function for all active superblocks * @f: function to call @@ -881,33 +904,22 @@ cancel_readonly: return retval; } -static void do_emergency_remount(struct work_struct *work) +static void do_emergency_remount_callback(struct super_block *sb) { - struct super_block *sb, *p = NULL; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; - sb->s_count++; - spin_unlock(&sb_lock); - down_write(&sb->s_umount); - if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && - !sb_rdonly(sb)) { - /* - * What lock protects sb->s_flags?? 
- */ - do_remount_sb(sb, SB_RDONLY, NULL, 1); - } - up_write(&sb->s_umount); - spin_lock(&sb_lock); - if (p) - __put_super(p); - p = sb; + down_write(&sb->s_umount); + if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && + !sb_rdonly(sb)) { + /* + * What lock protects sb->s_flags?? + */ + do_remount_sb(sb, SB_RDONLY, NULL, 1); } - if (p) - __put_super(p); - spin_unlock(&sb_lock); + up_write(&sb->s_umount); +} + +static void do_emergency_remount(struct work_struct *work) +{ + __iterate_supers(do_emergency_remount_callback); kfree(work); printk("Emergency Remount complete\n"); } @@ -923,6 +935,40 @@ void emergency_remount(void) } } +static void do_thaw_all_callback(struct super_block *sb) +{ + down_write(&sb->s_umount); + if (sb->s_root && sb->s_flags & MS_BORN) { + emergency_thaw_bdev(sb); + thaw_super_locked(sb); + } else { + up_write(&sb->s_umount); + } +} + +static void do_thaw_all(struct work_struct *work) +{ + __iterate_supers(do_thaw_all_callback); + kfree(work); + printk(KERN_WARNING "Emergency Thaw complete\n"); +} + +/** + * emergency_thaw_all -- forcibly thaw every frozen filesystem + * + * Used for emergency unfreeze of all filesystems via SysRq + */ +void emergency_thaw_all(void) +{ + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_thaw_all); + schedule_work(work); + } +} + /* * Unnamed block devices are dummy devices used by virtual * filesystems which don't use real block-devices. -- jrs @@ -1492,11 +1538,10 @@ EXPORT_SYMBOL(freeze_super); * * Unlocks the filesystem and marks it writeable again after freeze_super(). 
*/ -int thaw_super(struct super_block *sb) +static int thaw_super_locked(struct super_block *sb) { int error; - down_write(&sb->s_umount); if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) { up_write(&sb->s_umount); return -EINVAL; @@ -1527,4 +1572,10 @@ out: deactivate_locked_super(sb); return 0; } + +int thaw_super(struct super_block *sb) +{ + down_write(&sb->s_umount); + return thaw_super_locked(sb); +} EXPORT_SYMBOL(thaw_super); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index cf348ba99238..1acb2ff505e6 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1256,7 +1256,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, * Inode length changed, so we have to make sure * @I_DIRTY_DATASYNC is set. */ - __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); else mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 2dcf3d473fec..9571616b5dda 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -632,7 +632,7 @@ static int scan_for_idx_cb(struct ubifs_info *c, */ static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c) { - struct ubifs_lprops *lprops; + const struct ubifs_lprops *lprops; struct scan_data data; int err; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 6c3a1abd0e22..f5a46844340c 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -244,7 +244,6 @@ static void remove_from_lpt_heap(struct ubifs_info *c, /** * lpt_heap_replace - replace lprops in a category heap. * @c: UBIFS file-system description object - * @old_lprops: LEB properties to replace * @new_lprops: LEB properties with which to replace * @cat: LEB category * @@ -254,7 +253,6 @@ static void remove_from_lpt_heap(struct ubifs_info *c, * lprops. This function does that. 
*/ static void lpt_heap_replace(struct ubifs_info *c, - struct ubifs_lprops *old_lprops, struct ubifs_lprops *new_lprops, int cat) { struct ubifs_lpt_heap *heap; @@ -362,7 +360,7 @@ void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, case LPROPS_DIRTY: case LPROPS_DIRTY_IDX: case LPROPS_FREE: - lpt_heap_replace(c, old_lprops, new_lprops, cat); + lpt_heap_replace(c, new_lprops, cat); break; case LPROPS_UNCAT: case LPROPS_EMPTY: diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index aab87340d3de..16f03d9929e5 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -175,7 +175,6 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, int lnum, int offs) { - lnum = lnum; dbg_scan("stop scanning LEB %d at offset %d", lnum, offs); ubifs_assert(offs % c->min_io_size == 0); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b16ef162344a..6c397a389105 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1737,8 +1737,11 @@ static void ubifs_remount_ro(struct ubifs_info *c) dbg_save_space_info(c); - for (i = 0; i < c->jhead_cnt; i++) - ubifs_wbuf_sync(&c->jheads[i].wbuf); + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + ubifs_ro_mode(c, err); + } c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); @@ -1804,8 +1807,11 @@ static void ubifs_put_super(struct super_block *sb) int err; /* Synchronize write-buffers */ - for (i = 0; i < c->jhead_cnt; i++) - ubifs_wbuf_sync(&c->jheads[i].wbuf); + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + ubifs_ro_mode(c, err); + } /* * We are being cleanly unmounted which means the diff --git a/fs/udf/file.c b/fs/udf/file.c index 356c2bf148a5..cd31e4f6d6da 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -257,12 +257,22 @@ const struct file_operations 
udf_file_operations = { static int udf_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); + struct super_block *sb = inode->i_sb; int error; error = setattr_prepare(dentry, attr); if (error) return error; + if ((attr->ia_valid & ATTR_UID) && + UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET) && + !uid_eq(attr->ia_uid, UDF_SB(sb)->s_uid)) + return -EPERM; + if ((attr->ia_valid & ATTR_GID) && + UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET) && + !gid_eq(attr->ia_gid, UDF_SB(sb)->s_gid)) + return -EPERM; + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { error = udf_setsize(inode, attr->ia_size); diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index b6e420c1bfeb..b7a0d4b4bda1 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -104,6 +104,10 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) } inode_init_owner(inode, dir, mode); + if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) + inode->i_uid = sbi->s_uid; + if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) + inode->i_gid = sbi->s_gid; iinfo->i_location.logicalBlockNum = block; iinfo->i_location.partitionReferenceNum = diff --git a/fs/udf/inode.c b/fs/udf/inode.c index c23744d5ae5c..c80765d62f7e 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1275,6 +1275,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode) unsigned int indirections = 0; int bs = inode->i_sb->s_blocksize; int ret = -EIO; + uint32_t uid, gid; reread: if (iloc->partitionReferenceNum >= sbi->s_partitions) { @@ -1400,17 +1401,19 @@ reread: ret = -EIO; read_lock(&sbi->s_cred_lock); - i_uid_write(inode, le32_to_cpu(fe->uid)); - if (!uid_valid(inode->i_uid) || - UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || + uid = le32_to_cpu(fe->uid); + if (uid == UDF_INVALID_ID || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET)) - inode->i_uid = UDF_SB(inode->i_sb)->s_uid; + inode->i_uid = sbi->s_uid; + else + i_uid_write(inode, uid); - i_gid_write(inode, le32_to_cpu(fe->gid)); - if 
(!gid_valid(inode->i_gid) || - UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_IGNORE) || + gid = le32_to_cpu(fe->gid); + if (gid == UDF_INVALID_ID || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) - inode->i_gid = UDF_SB(inode->i_sb)->s_gid; + inode->i_gid = sbi->s_gid; + else + i_gid_write(inode, gid); if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && sbi->s_fmode != UDF_INVALID_MODE) @@ -1655,12 +1658,12 @@ static int udf_update_inode(struct inode *inode, int do_sync) } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) - fe->uid = cpu_to_le32(-1); + fe->uid = cpu_to_le32(UDF_INVALID_ID); else fe->uid = cpu_to_le32(i_uid_read(inode)); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) - fe->gid = cpu_to_le32(-1); + fe->gid = cpu_to_le32(UDF_INVALID_ID); else fe->gid = cpu_to_le32(i_gid_read(inode)); diff --git a/fs/udf/super.c b/fs/udf/super.c index f73239a9a97d..7949c338efa5 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -64,14 +64,13 @@ #include <linux/init.h> #include <linux/uaccess.h> -#define VDS_POS_PRIMARY_VOL_DESC 0 -#define VDS_POS_UNALLOC_SPACE_DESC 1 -#define VDS_POS_LOGICAL_VOL_DESC 2 -#define VDS_POS_PARTITION_DESC 3 -#define VDS_POS_IMP_USE_VOL_DESC 4 -#define VDS_POS_VOL_DESC_PTR 5 -#define VDS_POS_TERMINATING_DESC 6 -#define VDS_POS_LENGTH 7 +enum { + VDS_POS_PRIMARY_VOL_DESC, + VDS_POS_UNALLOC_SPACE_DESC, + VDS_POS_LOGICAL_VOL_DESC, + VDS_POS_IMP_USE_VOL_DESC, + VDS_POS_LENGTH +}; #define VSD_FIRST_SECTOR_OFFSET 32768 #define VSD_MAX_SECTOR_OFFSET 0x800000 @@ -223,10 +222,6 @@ struct udf_options { unsigned int session; unsigned int lastblock; unsigned int anchor; - unsigned int volume; - unsigned short partition; - unsigned int fileset; - unsigned int rootdir; unsigned int flags; umode_t umask; kgid_t gid; @@ -349,12 +344,8 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",shortad"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_FORGET)) seq_puts(seq, ",uid=forget"); - if (UDF_QUERY_FLAG(sb, 
UDF_FLAG_UID_IGNORE)) - seq_puts(seq, ",uid=ignore"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_FORGET)) seq_puts(seq, ",gid=forget"); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_IGNORE)) - seq_puts(seq, ",gid=ignore"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) seq_printf(seq, ",uid=%u", from_kuid(&init_user_ns, sbi->s_uid)); if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) @@ -371,10 +362,6 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",lastblock=%u", sbi->s_last_block); if (sbi->s_anchor != 0) seq_printf(seq, ",anchor=%u", sbi->s_anchor); - /* - * volume, partition, fileset and rootdir seem to be ignored - * currently - */ if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) seq_puts(seq, ",utf8"); if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) && sbi->s_nls_map) @@ -487,14 +474,9 @@ static int udf_parse_options(char *options, struct udf_options *uopt, int option; uopt->novrs = 0; - uopt->partition = 0xFFFF; uopt->session = 0xFFFFFFFF; uopt->lastblock = 0; uopt->anchor = 0; - uopt->volume = 0xFFFFFFFF; - uopt->rootdir = 0xFFFFFFFF; - uopt->fileset = 0xFFFFFFFF; - uopt->nls_map = NULL; if (!options) return 1; @@ -582,42 +564,30 @@ static int udf_parse_options(char *options, struct udf_options *uopt, uopt->anchor = option; break; case Opt_volume: - if (match_int(args, &option)) - return 0; - uopt->volume = option; - break; case Opt_partition: - if (match_int(args, &option)) - return 0; - uopt->partition = option; - break; case Opt_fileset: - if (match_int(args, &option)) - return 0; - uopt->fileset = option; - break; case Opt_rootdir: - if (match_int(args, &option)) - return 0; - uopt->rootdir = option; + /* Ignored (never implemented properly) */ break; case Opt_utf8: uopt->flags |= (1 << UDF_FLAG_UTF8); break; #ifdef CONFIG_UDF_NLS case Opt_iocharset: - uopt->nls_map = load_nls(args[0].from); - uopt->flags |= (1 << UDF_FLAG_NLS_MAP); + if (!remount) { + if (uopt->nls_map) + unload_nls(uopt->nls_map); + uopt->nls_map = load_nls(args[0].from); + uopt->flags 
|= (1 << UDF_FLAG_NLS_MAP); + } break; #endif - case Opt_uignore: - uopt->flags |= (1 << UDF_FLAG_UID_IGNORE); - break; case Opt_uforget: uopt->flags |= (1 << UDF_FLAG_UID_FORGET); break; + case Opt_uignore: case Opt_gignore: - uopt->flags |= (1 << UDF_FLAG_GID_IGNORE); + /* These options are superseeded by uid=<number> */ break; case Opt_gforget: uopt->flags |= (1 << UDF_FLAG_GID_FORGET); @@ -660,6 +630,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) uopt.umask = sbi->s_umask; uopt.fmode = sbi->s_fmode; uopt.dmode = sbi->s_dmode; + uopt.nls_map = NULL; if (!udf_parse_options(options, &uopt, true)) return -EINVAL; @@ -1592,6 +1563,60 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ sbi->s_lvid_bh = NULL; } +/* + * Step for reallocation of table of partition descriptor sequence numbers. + * Must be power of 2. + */ +#define PART_DESC_ALLOC_STEP 32 + +struct desc_seq_scan_data { + struct udf_vds_record vds[VDS_POS_LENGTH]; + unsigned int size_part_descs; + struct udf_vds_record *part_descs_loc; +}; + +static struct udf_vds_record *handle_partition_descriptor( + struct buffer_head *bh, + struct desc_seq_scan_data *data) +{ + struct partitionDesc *desc = (struct partitionDesc *)bh->b_data; + int partnum; + + partnum = le16_to_cpu(desc->partitionNumber); + if (partnum >= data->size_part_descs) { + struct udf_vds_record *new_loc; + unsigned int new_size = ALIGN(partnum, PART_DESC_ALLOC_STEP); + + new_loc = kzalloc(sizeof(*new_loc) * new_size, GFP_KERNEL); + if (!new_loc) + return ERR_PTR(-ENOMEM); + memcpy(new_loc, data->part_descs_loc, + data->size_part_descs * sizeof(*new_loc)); + kfree(data->part_descs_loc); + data->part_descs_loc = new_loc; + data->size_part_descs = new_size; + } + return &(data->part_descs_loc[partnum]); +} + + +static struct udf_vds_record *get_volume_descriptor_record(uint16_t ident, + struct buffer_head *bh, struct desc_seq_scan_data *data) +{ + switch (ident) { + case 
TAG_IDENT_PVD: /* ISO 13346 3/10.1 */ + return &(data->vds[VDS_POS_PRIMARY_VOL_DESC]); + case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */ + return &(data->vds[VDS_POS_IMP_USE_VOL_DESC]); + case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */ + return &(data->vds[VDS_POS_LOGICAL_VOL_DESC]); + case TAG_IDENT_USD: /* ISO 13346 3/10.8 */ + return &(data->vds[VDS_POS_UNALLOC_SPACE_DESC]); + case TAG_IDENT_PD: /* ISO 13346 3/10.5 */ + return handle_partition_descriptor(bh, data); + } + return NULL; +} /* * Process a main/reserve volume descriptor sequence. @@ -1608,18 +1633,23 @@ static noinline int udf_process_sequence( struct kernel_lb_addr *fileset) { struct buffer_head *bh = NULL; - struct udf_vds_record vds[VDS_POS_LENGTH]; struct udf_vds_record *curr; struct generic_desc *gd; struct volDescPtr *vdp; bool done = false; uint32_t vdsn; uint16_t ident; - long next_s = 0, next_e = 0; int ret; unsigned int indirections = 0; - - memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); + struct desc_seq_scan_data data; + unsigned int i; + + memset(data.vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); + data.size_part_descs = PART_DESC_ALLOC_STEP; + data.part_descs_loc = kzalloc(sizeof(*data.part_descs_loc) * + data.size_part_descs, GFP_KERNEL); + if (!data.part_descs_loc) + return -ENOMEM; /* * Read the main descriptor sequence and find which descriptors @@ -1628,79 +1658,51 @@ static noinline int udf_process_sequence( for (; (!done && block <= lastblock); block++) { bh = udf_read_tagged(sb, block, block, &ident); - if (!bh) { - udf_err(sb, - "Block %llu of volume descriptor sequence is corrupted or we could not read it\n", - (unsigned long long)block); - return -EAGAIN; - } + if (!bh) + break; /* Process each descriptor (ISO 13346 3/8.3-8.4) */ gd = (struct generic_desc *)bh->b_data; vdsn = le32_to_cpu(gd->volDescSeqNum); switch (ident) { - case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */ - curr = &vds[VDS_POS_PRIMARY_VOL_DESC]; - if (vdsn >= curr->volDescSeqNum) { - 
curr->volDescSeqNum = vdsn; - curr->block = block; - } - break; case TAG_IDENT_VDP: /* ISO 13346 3/10.3 */ - curr = &vds[VDS_POS_VOL_DESC_PTR]; - if (vdsn >= curr->volDescSeqNum) { - curr->volDescSeqNum = vdsn; - curr->block = block; - - vdp = (struct volDescPtr *)bh->b_data; - next_s = le32_to_cpu( - vdp->nextVolDescSeqExt.extLocation); - next_e = le32_to_cpu( - vdp->nextVolDescSeqExt.extLength); - next_e = next_e >> sb->s_blocksize_bits; - next_e += next_s; + if (++indirections > UDF_MAX_TD_NESTING) { + udf_err(sb, "too many Volume Descriptor " + "Pointers (max %u supported)\n", + UDF_MAX_TD_NESTING); + brelse(bh); + return -EIO; } + + vdp = (struct volDescPtr *)bh->b_data; + block = le32_to_cpu(vdp->nextVolDescSeqExt.extLocation); + lastblock = le32_to_cpu( + vdp->nextVolDescSeqExt.extLength) >> + sb->s_blocksize_bits; + lastblock += block - 1; + /* For loop is going to increment 'block' again */ + block--; break; + case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */ case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */ - curr = &vds[VDS_POS_IMP_USE_VOL_DESC]; - if (vdsn >= curr->volDescSeqNum) { - curr->volDescSeqNum = vdsn; - curr->block = block; - } - break; - case TAG_IDENT_PD: /* ISO 13346 3/10.5 */ - curr = &vds[VDS_POS_PARTITION_DESC]; - if (!curr->block) - curr->block = block; - break; case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */ - curr = &vds[VDS_POS_LOGICAL_VOL_DESC]; - if (vdsn >= curr->volDescSeqNum) { - curr->volDescSeqNum = vdsn; - curr->block = block; - } - break; case TAG_IDENT_USD: /* ISO 13346 3/10.8 */ - curr = &vds[VDS_POS_UNALLOC_SPACE_DESC]; + case TAG_IDENT_PD: /* ISO 13346 3/10.5 */ + curr = get_volume_descriptor_record(ident, bh, &data); + if (IS_ERR(curr)) { + brelse(bh); + return PTR_ERR(curr); + } + /* Descriptor we don't care about? 
*/ + if (!curr) + break; if (vdsn >= curr->volDescSeqNum) { curr->volDescSeqNum = vdsn; curr->block = block; } break; case TAG_IDENT_TD: /* ISO 13346 3/10.9 */ - if (++indirections > UDF_MAX_TD_NESTING) { - udf_err(sb, "too many TDs (max %u supported)\n", UDF_MAX_TD_NESTING); - brelse(bh); - return -EIO; - } - - vds[VDS_POS_TERMINATING_DESC].block = block; - if (next_e) { - block = next_s; - lastblock = next_e; - next_s = next_e = 0; - } else - done = true; + done = true; break; } brelse(bh); @@ -1709,31 +1711,27 @@ static noinline int udf_process_sequence( * Now read interesting descriptors again and process them * in a suitable order */ - if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) { + if (!data.vds[VDS_POS_PRIMARY_VOL_DESC].block) { udf_err(sb, "Primary Volume Descriptor not found!\n"); return -EAGAIN; } - ret = udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block); + ret = udf_load_pvoldesc(sb, data.vds[VDS_POS_PRIMARY_VOL_DESC].block); if (ret < 0) return ret; - if (vds[VDS_POS_LOGICAL_VOL_DESC].block) { + if (data.vds[VDS_POS_LOGICAL_VOL_DESC].block) { ret = udf_load_logicalvol(sb, - vds[VDS_POS_LOGICAL_VOL_DESC].block, - fileset); + data.vds[VDS_POS_LOGICAL_VOL_DESC].block, + fileset); if (ret < 0) return ret; } - if (vds[VDS_POS_PARTITION_DESC].block) { - /* - * We rescan the whole descriptor sequence to find - * partition descriptor blocks and process them. 
- */ - for (block = vds[VDS_POS_PARTITION_DESC].block; - block < vds[VDS_POS_TERMINATING_DESC].block; - block++) { - ret = udf_load_partdesc(sb, block); + /* Now handle prevailing Partition Descriptors */ + for (i = 0; i < data.size_part_descs; i++) { + if (data.part_descs_loc[i].block) { + ret = udf_load_partdesc(sb, + data.part_descs_loc[i].block); if (ret < 0) return ret; } @@ -1760,13 +1758,13 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation); main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength); main_e = main_e >> sb->s_blocksize_bits; - main_e += main_s; + main_e += main_s - 1; /* Locate the reserve sequence */ reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation); reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength); reserve_e = reserve_e >> sb->s_blocksize_bits; - reserve_e += reserve_s; + reserve_e += reserve_s - 1; /* Process the main & reserve sequences */ /* responsible for finding the PartitionDesc(s) */ @@ -1994,7 +1992,10 @@ static void udf_open_lvid(struct super_block *sb) lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; ktime_get_real_ts(&ts); udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts); - lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); + if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE) + lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); + else + UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT); lvid->descTag.descCRC = cpu_to_le16( crc_itu_t(0, (char *)lvid + sizeof(struct tag), @@ -2034,7 +2035,8 @@ static void udf_close_lvid(struct super_block *sb) lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev); if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev)) lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev); - lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT)) + lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); 
lvid->descTag.descCRC = cpu_to_le16( crc_itu_t(0, (char *)lvid + sizeof(struct tag), @@ -2091,11 +2093,13 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) bool lvid_open = false; uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT); - uopt.uid = INVALID_UID; - uopt.gid = INVALID_GID; + /* By default we'll use overflow[ug]id when UDF inode [ug]id == -1 */ + uopt.uid = make_kuid(current_user_ns(), overflowuid); + uopt.gid = make_kgid(current_user_ns(), overflowgid); uopt.umask = 0; uopt.fmode = UDF_INVALID_MODE; uopt.dmode = UDF_INVALID_MODE; + uopt.nls_map = NULL; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -2276,8 +2280,8 @@ error_out: iput(sbi->s_vat_inode); parse_options_failure: #ifdef CONFIG_UDF_NLS - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) - unload_nls(sbi->s_nls_map); + if (uopt.nls_map) + unload_nls(uopt.nls_map); #endif if (lvid_open) udf_close_lvid(sb); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 68c9f1d618f5..9dd3e1b9619e 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -23,14 +23,13 @@ #define UDF_FLAG_NLS_MAP 9 #define UDF_FLAG_UTF8 10 #define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ -#define UDF_FLAG_UID_IGNORE 12 /* use sb uid instead of on disk uid */ -#define UDF_FLAG_GID_FORGET 13 -#define UDF_FLAG_GID_IGNORE 14 -#define UDF_FLAG_UID_SET 15 -#define UDF_FLAG_GID_SET 16 -#define UDF_FLAG_SESSION_SET 17 -#define UDF_FLAG_LASTBLOCK_SET 18 -#define UDF_FLAG_BLOCKSIZE_SET 19 +#define UDF_FLAG_GID_FORGET 12 +#define UDF_FLAG_UID_SET 13 +#define UDF_FLAG_GID_SET 14 +#define UDF_FLAG_SESSION_SET 15 +#define UDF_FLAG_LASTBLOCK_SET 16 +#define UDF_FLAG_BLOCKSIZE_SET 17 +#define UDF_FLAG_INCONSISTENT 18 #define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 #define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index f5e0fe78979e..68e8a64d22e0 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -48,6 +48,8 @@ extern __printf(3, 4) void 
_udf_warn(struct super_block *sb, #define UDF_EXTENT_LENGTH_MASK 0x3FFFFFFF #define UDF_EXTENT_FLAG_MASK 0xC0000000 +#define UDF_INVALID_ID ((uint32_t)-1) + #define UDF_NAME_PAD 4 #define UDF_NAME_LEN 254 #define UDF_NAME_LEN_CS0 255 diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 39387bdd225d..4bcc095fe44a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1947,7 +1947,7 @@ void xfs_alloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr, + mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, (mp->m_sb.sb_agblocks + 1) / 2); } @@ -1959,7 +1959,6 @@ xfs_alloc_compute_maxlevels( */ xfs_extlen_t xfs_alloc_longest_free_extent( - struct xfs_mount *mp, struct xfs_perag *pag, xfs_extlen_t need, xfs_extlen_t reserved) @@ -2038,8 +2037,7 @@ xfs_alloc_space_available( /* do we have enough contiguous free space for the allocation? */ alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; - longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free, - reservation); + longest = xfs_alloc_longest_free_extent(pag, min_free, reservation); if (longest < alloc_len) return false; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index a311a2414a6b..cbf789ea5a4e 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -116,9 +116,8 @@ xfs_alloc_allow_busy_reuse(int datatype) unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); -xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, - struct xfs_perag *pag, xfs_extlen_t need, - xfs_extlen_t reserved); +xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag, + xfs_extlen_t need, xfs_extlen_t reserved); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c 
index 3b03d886df66..6a7c2f03ea11 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3225,7 +3225,7 @@ xfs_bmap_longest_free_extent( } } - longest = xfs_alloc_longest_free_extent(mp, pag, + longest = xfs_alloc_longest_free_extent(pag, xfs_alloc_min_freelist(mp, pag), xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) @@ -5667,7 +5667,6 @@ xfs_bmap_collapse_extents( xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, - xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops) { diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f3be6416260b..2b766b37096d 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -228,7 +228,7 @@ void xfs_bmap_del_extent_cow(struct xfs_inode *ip, uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, - bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, + bool *done, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops); int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index edc0193358a5..ac7d66427e42 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4531,7 +4531,6 @@ xfs_btree_sblock_verify( */ uint xfs_btree_compute_maxlevels( - struct xfs_mount *mp, uint *limits, unsigned long len) { @@ -4839,7 +4838,6 @@ xfs_btree_query_all( */ xfs_extlen_t xfs_btree_calc_size( - struct xfs_mount *mp, uint *limits, unsigned long long len) { diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 58e30c0975c3..9227159a751e 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -481,10 +481,8 @@ xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp, xfs_failaddr_t 
xfs_btree_lblock_verify(struct xfs_buf *bp, unsigned int max_recs); -uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, - unsigned long len); -xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits, - unsigned long long len); +uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); +xfs_extlen_t xfs_btree_calc_size(uint *limits, unsigned long long len); /* return codes */ #define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 0e2cf5f0be1f..de627fa19168 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2406,7 +2406,7 @@ xfs_ialloc_compute_maxlevels( uint inodes; inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; - mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr, + mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp->m_inobt_mnr, inodes); } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index a2dd7f4a2719..367e9a0726e6 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -556,7 +556,7 @@ xfs_inobt_max_size( if (mp->m_inobt_mxr[0] == 0) return 0; - return xfs_btree_calc_size(mp, mp->m_inobt_mnr, + return xfs_btree_calc_size(mp->m_inobt_mnr, (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / XFS_INODES_PER_CHUNK); } diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index bee68c23d612..560e28473024 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -351,7 +351,6 @@ xfs_refcount_merge_center_extents( struct xfs_refcount_irec *center, struct xfs_refcount_irec *right, unsigned long long extlen, - xfs_agblock_t *agbno, xfs_extlen_t *aglen) { int error; @@ -471,7 +470,6 @@ xfs_refcount_merge_right_extent( struct xfs_btree_cur *cur, struct xfs_refcount_irec *right, struct xfs_refcount_irec *cright, - xfs_agblock_t *agbno, xfs_extlen_t *aglen) { int error; @@ 
-749,7 +747,7 @@ xfs_refcount_merge_extents( ulen < MAXREFCEXTLEN) { *shape_changed = true; return xfs_refcount_merge_center_extents(cur, &left, &cleft, - &right, ulen, agbno, aglen); + &right, ulen, aglen); } /* Try to merge left and cleft. */ @@ -778,7 +776,7 @@ xfs_refcount_merge_extents( ulen < MAXREFCEXTLEN) { *shape_changed = true; return xfs_refcount_merge_right_extent(cur, &right, &cright, - agbno, aglen); + aglen); } return error; @@ -1356,9 +1354,7 @@ xfs_refcount_adjust_cow_extents( struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, - enum xfs_refc_adjust_op adj, - struct xfs_defer_ops *dfops, - struct xfs_owner_info *oinfo) + enum xfs_refc_adjust_op adj) { struct xfs_refcount_irec ext, tmp; int error; @@ -1437,8 +1433,7 @@ xfs_refcount_adjust_cow( struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, - enum xfs_refc_adjust_op adj, - struct xfs_defer_ops *dfops) + enum xfs_refc_adjust_op adj) { bool shape_changed; int error; @@ -1465,8 +1460,7 @@ xfs_refcount_adjust_cow( goto out_error; /* Now that we've taken care of the ends, adjust the middle extents */ - error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj, - dfops, NULL); + error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj); if (error) goto out_error; @@ -1493,7 +1487,7 @@ __xfs_refcount_cow_alloc( /* Add refcount btree reservation */ return xfs_refcount_adjust_cow(rcur, agbno, aglen, - XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops); + XFS_REFCOUNT_ADJUST_COW_ALLOC); } /* @@ -1511,7 +1505,7 @@ __xfs_refcount_cow_free( /* Remove refcount btree reservation */ return xfs_refcount_adjust_cow(rcur, agbno, aglen, - XFS_REFCOUNT_ADJUST_COW_FREE, dfops); + XFS_REFCOUNT_ADJUST_COW_FREE); } /* Record a CoW staging extent in the refcount btree. */ @@ -1568,7 +1562,7 @@ struct xfs_refcount_recovery { /* Stuff an extent on the recovery list. 
*/ STATIC int xfs_refcount_recover_extent( - struct xfs_btree_cur *cur, + struct xfs_btree_cur *cur, union xfs_btree_rec *rec, void *priv) { diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 265fdcefcbae..375abfeb6267 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -373,7 +373,6 @@ xfs_refcountbt_init_cursor( */ int xfs_refcountbt_maxrecs( - struct xfs_mount *mp, int blocklen, bool leaf) { @@ -390,7 +389,7 @@ void xfs_refcountbt_compute_maxlevels( struct xfs_mount *mp) { - mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_refc_maxlevels = xfs_btree_compute_maxlevels( mp->m_refc_mnr, mp->m_sb.sb_agblocks); } @@ -400,7 +399,7 @@ xfs_refcountbt_calc_size( struct xfs_mount *mp, unsigned long long len) { - return xfs_btree_calc_size(mp, mp->m_refc_mnr, len); + return xfs_btree_calc_size(mp->m_refc_mnr, len); } /* diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index 9db008b955b7..2bc4694ef146 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -60,8 +60,7 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, struct xfs_defer_ops *dfops); -extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen, - bool leaf); +extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 79822cf6ebe3..fba8d2718017 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -376,7 +376,6 @@ xfs_rmap_free_check_owner( struct xfs_mount *mp, uint64_t ltoff, struct xfs_rmap_irec *rec, - xfs_fsblock_t bno, xfs_filblks_t len, uint64_t owner, uint64_t offset, @@ -519,7 +518,7 @@ 
xfs_rmap_unmap( bno + len, out_error); /* Check owner information. */ - error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, bno, len, owner, + error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner, offset, flags); if (error) goto out_error; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 8b0d0de1cd11..d756e0b84abf 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -499,7 +499,6 @@ xfs_rmapbt_init_cursor( */ int xfs_rmapbt_maxrecs( - struct xfs_mount *mp, int blocklen, int leaf) { @@ -534,7 +533,7 @@ xfs_rmapbt_compute_maxlevels( if (xfs_sb_version_hasreflink(&mp->m_sb)) mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; else - mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( mp->m_rmap_mnr, mp->m_sb.sb_agblocks); } @@ -544,7 +543,7 @@ xfs_rmapbt_calc_size( struct xfs_mount *mp, unsigned long long len) { - return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len); + return xfs_btree_calc_size(mp->m_rmap_mnr, len); } /* diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 19c08e933049..d68d96eed7ea 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -55,7 +55,7 @@ struct xfs_mount; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, xfs_agnumber_t agno); -int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf); +int xfs_rmapbt_maxrecs(int blocklen, int leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 53433cc024fd..d9b94bd5f689 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -756,15 +756,13 @@ xfs_sb_mount_common( mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; - mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp,
sbp->sb_blocksize, 1); - mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 1); + mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 0); mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; - mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, - true); - mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, - false); + mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, true); + mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, false); mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 5f17641f040f..3bccdf73e141 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -734,8 +734,7 @@ xfs_calc_clear_agi_bucket_reservation( * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) */ STATIC uint -xfs_calc_qm_setqlim_reservation( - struct xfs_mount *mp) +xfs_calc_qm_setqlim_reservation(void) { return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); } @@ -772,8 +771,7 @@ xfs_calc_qm_quotaoff_reservation( * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 */ STATIC uint -xfs_calc_qm_quotaoff_end_reservation( - struct xfs_mount *mp) +xfs_calc_qm_quotaoff_end_reservation(void) { return sizeof(struct xfs_qoff_logitem) * 2; } @@ -877,14 +875,14 @@ xfs_trans_resv_calc( * The following transactions are logged in logical format with * a default log count. 
*/ - resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp); + resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(); resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT; resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp); resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; resp->tr_qm_equotaoff.tr_logres = - xfs_calc_qm_quotaoff_end_reservation(mp); + xfs_calc_qm_quotaoff_end_reservation(); resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 19eadc807056..0ab824f574ed 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1195,16 +1195,22 @@ xfs_vm_writepages( int ret; xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - if (dax_mapping(mapping)) - return dax_writeback_mapping_range(mapping, - xfs_find_bdev_for_inode(mapping->host), wbc); - ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc); if (wpc.ioend) ret = xfs_submit_ioend(wbc, wpc.ioend, ret); return ret; } +STATIC int +xfs_dax_writepages( + struct address_space *mapping, + struct writeback_control *wbc) +{ + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + return dax_writeback_mapping_range(mapping, + xfs_find_bdev_for_inode(mapping->host), wbc); +} + /* * Called to move a page into cleanable state - and from there * to be released. The page should already be clean. We always @@ -1367,17 +1373,6 @@ out_unlock: return error; } -STATIC ssize_t -xfs_vm_direct_IO( - struct kiocb *iocb, - struct iov_iter *iter) -{ - /* - * We just need the method present so that open/fcntl allow direct I/O. - */ - return -EINVAL; -} - STATIC sector_t xfs_vm_bmap( struct address_space *mapping, @@ -1390,7 +1385,7 @@ xfs_vm_bmap( /* * The swap code (ab-)uses ->bmap to get a block mapping and then - * bypasseѕ the file system for actual I/O. We really can't allow + * bypasses the file system for actual I/O. 
We really can't allow * that on reflinks inodes, so we have to skip out here. And yes, * 0 is the magic code for a bmap error. * @@ -1472,19 +1467,8 @@ xfs_vm_set_page_dirty( newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); - if (newly_dirty) { - /* sigh - __set_page_dirty() is static, so copy it here, too */ - unsigned long flags; - - spin_lock_irqsave(&mapping->tree_lock, flags); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - } - spin_unlock_irqrestore(&mapping->tree_lock, flags); - } + if (newly_dirty) + __set_page_dirty(page, mapping, 1); unlock_page_memcg(page); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1500,8 +1484,15 @@ const struct address_space_operations xfs_address_space_operations = { .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, .bmap = xfs_vm_bmap, - .direct_IO = xfs_vm_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; + +const struct address_space_operations xfs_dax_aops = { + .writepages = xfs_dax_writepages, + .direct_IO = noop_direct_IO, + .set_page_dirty = noop_set_page_dirty, + .invalidatepage = noop_invalidatepage, +}; diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 88c85ea63da0..69346d460dfa 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -54,6 +54,7 @@ struct xfs_ioend { }; extern const struct address_space_operations xfs_address_space_operations; +extern const struct address_space_operations xfs_dax_aops; int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e5fb008d75e8..2203465e63ea 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -53,6 
+53,25 @@ xfs_bui_item_free( kmem_zone_free(xfs_bui_zone, buip); } +/* + * Freeing the BUI requires that we remove it from the AIL if it has already + * been placed there. However, the BUI may not yet have been placed in the AIL + * when called by xfs_bui_release() from BUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the BUI. + */ +void +xfs_bui_release( + struct xfs_bui_log_item *buip) +{ + ASSERT(atomic_read(&buip->bui_refcount) > 0); + if (atomic_dec_and_test(&buip->bui_refcount)) { + xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_bui_item_free(buip); + } +} + + STATIC void xfs_bui_item_size( struct xfs_log_item *lip, @@ -142,7 +161,7 @@ xfs_bui_item_unlock( struct xfs_log_item *lip) { if (lip->li_flags & XFS_LI_ABORTED) - xfs_bui_item_free(BUI_ITEM(lip)); + xfs_bui_release(BUI_ITEM(lip)); } /* @@ -206,24 +225,6 @@ xfs_bui_init( return buip; } -/* - * Freeing the BUI requires that we remove it from the AIL if it has already - * been placed there. However, the BUI may not yet have been placed in the AIL - * when called by xfs_bui_release() from BUD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the BUI. 
- */ -void -xfs_bui_release( - struct xfs_bui_log_item *buip) -{ - ASSERT(atomic_read(&buip->bui_refcount) > 0); - if (atomic_dec_and_test(&buip->bui_refcount)) { - xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_bui_item_free(buip); - } -} - static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_bud_log_item, bud_item); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 05dee8fdd895..8cd8c412f52d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1326,7 +1326,6 @@ xfs_collapse_file_space( int error; struct xfs_defer_ops dfops; xfs_fsblock_t first_block; - xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len); xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); @@ -1361,7 +1360,7 @@ xfs_collapse_file_space( xfs_defer_init(&dfops, &first_block); error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb, - &done, stop_fsb, &first_block, &dfops); + &done, &first_block, &dfops); if (error) goto out_bmap_cancel; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ac669a10c62f..55661cbdb51b 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1754,7 +1754,6 @@ xfs_buftarg_shrink_count( void xfs_free_buftarg( - struct xfs_mount *mp, struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 2f4c91452861..edced162a674 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -388,7 +388,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, struct block_device *, struct dax_device *); -extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); +extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, 
unsigned int); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index b2cde5426182..7b68e6c9a474 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -50,19 +50,19 @@ xfs_trim_extents( pag = xfs_perag_get(mp, agno); - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); - if (error || !agbp) - goto out_put_perag; - - cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); - /* * Force out the log. This means any transactions that might have freed - * space before we took the AGF buffer lock are now on disk, and the + * space before we take the AGF buffer lock are now on disk, and the * volatile disk cache is flushed. */ xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + /* * Look up the longest btree in the AGF and start with it. */ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 761f3189eff2..eed698aa9f16 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -122,7 +122,7 @@ xfs_nfs_get_inode( struct super_block *sb, u64 ino, u32 generation) - { +{ xfs_mount_t *mp = XFS_M(sb); xfs_inode_t *ip; int error; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 64da90655e95..b5b1e567b9f4 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -51,6 +51,24 @@ xfs_efi_item_free( } /* + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. 
+ */ +void +xfs_efi_release( + struct xfs_efi_log_item *efip) +{ + ASSERT(atomic_read(&efip->efi_refcount) > 0); + if (atomic_dec_and_test(&efip->efi_refcount)) { + xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); + xfs_efi_item_free(efip); + } +} + +/* * This returns the number of iovecs needed to log the given efi item. * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. @@ -151,7 +169,7 @@ xfs_efi_item_unlock( struct xfs_log_item *lip) { if (lip->li_flags & XFS_LI_ABORTED) - xfs_efi_item_free(EFI_ITEM(lip)); + xfs_efi_release(EFI_ITEM(lip)); } /* @@ -279,24 +297,6 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) return -EFSCORRUPTED; } -/* - * Freeing the efi requires that we remove it from the AIL if it has already - * been placed there. However, the EFI may not yet have been placed in the AIL - * when called by xfs_efi_release() from EFD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the EFI. 
- */ -void -xfs_efi_release( - struct xfs_efi_log_item *efip) -{ - ASSERT(atomic_read(&efip->efi_refcount) > 0); - if (atomic_dec_and_test(&efip->efi_refcount)) { - xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); - xfs_efi_item_free(efip); - } -} - static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_efd_log_item, efd_item); diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 043ca3808ea2..3f8722e51dbe 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -34,7 +34,6 @@ struct xfs_fstrm_item { struct xfs_mru_cache_elem mru; - struct xfs_inode *ip; xfs_agnumber_t ag; /* AG in use for this directory */ }; @@ -122,14 +121,15 @@ xfs_filestream_put_ag( static void xfs_fstrm_free_func( + void *data, struct xfs_mru_cache_elem *mru) { + struct xfs_mount *mp = data; struct xfs_fstrm_item *item = container_of(mru, struct xfs_fstrm_item, mru); - xfs_filestream_put_ag(item->ip->i_mount, item->ag); - - trace_xfs_filestream_free(item->ip, item->ag); + xfs_filestream_put_ag(mp, item->ag); + trace_xfs_filestream_free(mp, mru->key, item->ag); kmem_free(item); } @@ -165,7 +165,7 @@ xfs_filestream_pick_ag( trylock = XFS_ALLOC_FLAG_TRYLOCK; for (nscan = 0; 1; nscan++) { - trace_xfs_filestream_scan(ip, ag); + trace_xfs_filestream_scan(mp, ip->i_ino, ag); pag = xfs_perag_get(mp, ag); @@ -198,7 +198,7 @@ xfs_filestream_pick_ag( goto next_ag; } - longest = xfs_alloc_longest_free_extent(mp, pag, + longest = xfs_alloc_longest_free_extent(pag, xfs_alloc_min_freelist(mp, pag), xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (((minlen && longest >= minlen) || @@ -265,7 +265,6 @@ next_ag: goto out_put_ag; item->ag = *agp; - item->ip = ip; err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); if (err) { @@ -333,7 +332,7 @@ xfs_filestream_lookup_ag( ag = container_of(mru, struct xfs_fstrm_item, mru)->ag; xfs_mru_cache_done(mp->m_filestream); - trace_xfs_filestream_lookup(ip, 
ag); + trace_xfs_filestream_lookup(mp, ip->i_ino, ag); goto out; } @@ -399,7 +398,7 @@ xfs_filestream_new_ag( * Only free the item here so we skip over the old AG earlier. */ if (mru) - xfs_fstrm_free_func(mru); + xfs_fstrm_free_func(mp, mru); IRELE(pip); exit: @@ -426,8 +425,8 @@ xfs_filestream_mount( * timer tunable to within about 10 percent. This requires at least 10 * groups. */ - return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10, - 10, xfs_fstrm_free_func); + return xfs_mru_cache_create(&mp->m_filestream, mp, + xfs_fstrm_centisecs * 10, 10, xfs_fstrm_free_func); } void diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3e3aab3888fa..2b70c8b4cee2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -972,10 +972,8 @@ xfs_dir_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, /* project id */ - xfs_inode_t **ipp, /* pointer to inode; it will be + xfs_inode_t **ipp) /* pointer to inode; it will be locked. */ - int *committed) - { xfs_trans_t *tp; xfs_inode_t *ip; @@ -1050,8 +1048,6 @@ xfs_dir_ialloc( } code = xfs_trans_roll(&tp); - if (committed != NULL) - *committed = 1; /* * Re-attach the quota info that we detached from prev trx. @@ -1088,9 +1084,6 @@ xfs_dir_ialloc( } ASSERT(!ialloc_context && ip); - } else { - if (committed != NULL) - *committed = 0; } *ipp = ip; @@ -1217,8 +1210,7 @@ xfs_create( * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip, - NULL); + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 
2 : 1, rdev, prid, &ip); if (error) goto out_trans_cancel; @@ -1309,7 +1301,6 @@ xfs_create( int xfs_create_tmpfile( struct xfs_inode *dp, - struct dentry *dentry, umode_t mode, struct xfs_inode **ipp) { @@ -1351,7 +1342,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip); if (error) goto out_trans_cancel; @@ -1611,13 +1602,15 @@ xfs_itruncate_extents( goto out; } - /* Remove all pending CoW reservations. */ - error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, - last_block, true); - if (error) - goto out; + if (whichfork == XFS_DATA_FORK) { + /* Remove all pending CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, + first_unmap_block, last_block, true); + if (error) + goto out; - xfs_itruncate_clear_reflink_flags(ip); + xfs_itruncate_clear_reflink_flags(ip); + } /* * Always re-log the inode so that our permanent transaction can keep @@ -2903,7 +2896,7 @@ xfs_rename_alloc_whiteout( struct xfs_inode *tmpfile; int error; - error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile); + error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile); if (error) return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 132d8aa2afc4..1eebc53df7d7 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -393,8 +393,8 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, dev_t rdev, struct xfs_inode **ipp); -int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, - umode_t mode, struct xfs_inode **ipp); +int xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode, + struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, @@ -431,7 
+431,7 @@ xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, dev_t, prid_t, - struct xfs_inode **, int *); + struct xfs_inode **); /* from xfs_file.c */ enum xfs_prealloc_flags { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e0307fbff911..a3ed3c811dfa 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -177,7 +177,7 @@ xfs_generic_create( if (!tmpfile) { error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); } else { - error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + error = xfs_create_tmpfile(XFS_I(dir), mode, &ip); } if (unlikely(error)) goto out_free_acl; @@ -1285,7 +1285,10 @@ xfs_setup_iops( case S_IFREG: inode->i_op = &xfs_inode_operations; inode->i_fop = &xfs_file_operations; - inode->i_mapping->a_ops = &xfs_address_space_operations; + if (IS_DAX(inode)) + inode->i_mapping->a_ops = &xfs_dax_aops; + else + inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b9c9c848146b..2fcd9ed5d075 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -560,7 +560,6 @@ xfs_log_done( */ int xfs_log_notify( - struct xfs_mount *mp, struct xlog_in_core *iclog, xfs_log_callback_t *cb) { diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 7e2d62922a16..fa8ad31d587f 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -141,8 +141,7 @@ int xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); -int xfs_log_notify(struct xfs_mount *mp, - struct xlog_in_core *iclog, +int xfs_log_notify(struct xlog_in_core *iclog, struct xfs_log_callback *callback_entry); int xfs_log_release_iclog(struct xfs_mount *mp, struct xlog_in_core *iclog); diff --git a/fs/xfs/xfs_log_cil.c 
b/fs/xfs/xfs_log_cil.c index cb376ac8a595..4668403b1741 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -848,7 +848,7 @@ restart: /* attach all the transactions w/ busy extents to iclog */ ctx->log_cb.cb_func = xlog_cil_committed; ctx->log_cb.cb_arg = ctx; - error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); + error = xfs_log_notify(commit_iclog, &ctx->log_cb); if (error) goto out_abort; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index f8a674d7f092..70eea7ae2876 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -112,6 +112,7 @@ struct xfs_mru_cache { xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ struct delayed_work work; /* Workqueue data for reaping. */ unsigned int queued; /* work has been queued */ + void *data; }; static struct workqueue_struct *xfs_mru_reap_wq; @@ -259,7 +260,7 @@ _xfs_mru_cache_clear_reap_list( list_for_each_entry_safe(elem, next, &tmp, list_node) { list_del_init(&elem->list_node); - mru->free_func(elem); + mru->free_func(mru->data, elem); } spin_lock(&mru->lock); @@ -326,6 +327,7 @@ xfs_mru_cache_uninit(void) int xfs_mru_cache_create( struct xfs_mru_cache **mrup, + void *data, unsigned int lifetime_ms, unsigned int grp_count, xfs_mru_cache_free_func_t free_func) @@ -369,7 +371,7 @@ xfs_mru_cache_create( mru->grp_time = grp_time; mru->free_func = free_func; - + mru->data = data; *mrup = mru; exit: @@ -492,7 +494,7 @@ xfs_mru_cache_delete( elem = xfs_mru_cache_remove(mru, key); if (elem) - mru->free_func(elem); + mru->free_func(mru->data, elem); } /* diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index fb5245ba5ff7..b3f3fbdfcc47 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h @@ -26,13 +26,13 @@ struct xfs_mru_cache_elem { }; /* Function pointer type for callback to free a client's data pointer. 
*/ -typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem); +typedef void (*xfs_mru_cache_free_func_t)(void *, struct xfs_mru_cache_elem *); int xfs_mru_cache_init(void); void xfs_mru_cache_uninit(void); -int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, - unsigned int grp_count, - xfs_mru_cache_free_func_t free_func); +int xfs_mru_cache_create(struct xfs_mru_cache **mrup, void *data, + unsigned int lifetime_ms, unsigned int grp_count, + xfs_mru_cache_free_func_t free_func); void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, struct xfs_mru_cache_elem *elem); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5b848f4b637f..ec39ae274c78 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -748,7 +748,6 @@ xfs_qm_qino_alloc( { xfs_trans_t *tp; int error; - int committed; bool need_alloc = true; *ip = NULL; @@ -788,8 +787,7 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { - error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip, - &committed); + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip); if (error) { xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 7a39f40645f7..15c9393dd7a7 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -52,6 +52,25 @@ xfs_cui_item_free( kmem_zone_free(xfs_cui_zone, cuip); } +/* + * Freeing the CUI requires that we remove it from the AIL if it has already + * been placed there. However, the CUI may not yet have been placed in the AIL + * when called by xfs_cui_release() from CUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the CUI. 
+ */ +void +xfs_cui_release( + struct xfs_cui_log_item *cuip) +{ + ASSERT(atomic_read(&cuip->cui_refcount) > 0); + if (atomic_dec_and_test(&cuip->cui_refcount)) { + xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_cui_item_free(cuip); + } +} + + STATIC void xfs_cui_item_size( struct xfs_log_item *lip, @@ -141,7 +160,7 @@ xfs_cui_item_unlock( struct xfs_log_item *lip) { if (lip->li_flags & XFS_LI_ABORTED) - xfs_cui_item_free(CUI_ITEM(lip)); + xfs_cui_release(CUI_ITEM(lip)); } /* @@ -211,24 +230,6 @@ xfs_cui_init( return cuip; } -/* - * Freeing the CUI requires that we remove it from the AIL if it has already - * been placed there. However, the CUI may not yet have been placed in the AIL - * when called by xfs_cui_release() from CUD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the CUI. - */ -void -xfs_cui_release( - struct xfs_cui_log_item *cuip) -{ - ASSERT(atomic_read(&cuip->cui_refcount) > 0); - if (atomic_dec_and_test(&cuip->cui_refcount)) { - xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_cui_item_free(cuip); - } -} - static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_cud_log_item, cud_item); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 49d3124863a8..06a07846c9b3 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -52,6 +52,24 @@ xfs_rui_item_free( kmem_zone_free(xfs_rui_zone, ruip); } +/* + * Freeing the RUI requires that we remove it from the AIL if it has already + * been placed there. However, the RUI may not yet have been placed in the AIL + * when called by xfs_rui_release() from RUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the RUI. 
+ */ +void +xfs_rui_release( + struct xfs_rui_log_item *ruip) +{ + ASSERT(atomic_read(&ruip->rui_refcount) > 0); + if (atomic_dec_and_test(&ruip->rui_refcount)) { + xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_rui_item_free(ruip); + } +} + STATIC void xfs_rui_item_size( struct xfs_log_item *lip, @@ -141,7 +159,7 @@ xfs_rui_item_unlock( struct xfs_log_item *lip) { if (lip->li_flags & XFS_LI_ABORTED) - xfs_rui_item_free(RUI_ITEM(lip)); + xfs_rui_release(RUI_ITEM(lip)); } /* @@ -233,24 +251,6 @@ xfs_rui_copy_format( return 0; } -/* - * Freeing the RUI requires that we remove it from the AIL if it has already - * been placed there. However, the RUI may not yet have been placed in the AIL - * when called by xfs_rui_release() from RUD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the RUI. - */ -void -xfs_rui_release( - struct xfs_rui_log_item *ruip) -{ - ASSERT(atomic_read(&ruip->rui_refcount) > 0); - if (atomic_dec_and_test(&ruip->rui_refcount)) { - xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_rui_item_free(ruip); - } -} - static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_rud_log_item, rud_item); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 612c1d5348b3..d71424052917 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -722,7 +722,7 @@ xfs_close_devices( struct block_device *logdev = mp->m_logdev_targp->bt_bdev; struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev; - xfs_free_buftarg(mp, mp->m_logdev_targp); + xfs_free_buftarg(mp->m_logdev_targp); xfs_blkdev_put(logdev); fs_put_dax(dax_logdev); } @@ -730,11 +730,11 @@ xfs_close_devices( struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev; - xfs_free_buftarg(mp, mp->m_rtdev_targp); + 
xfs_free_buftarg(mp->m_rtdev_targp); xfs_blkdev_put(rtdev); fs_put_dax(dax_rtdev); } - xfs_free_buftarg(mp, mp->m_ddev_targp); + xfs_free_buftarg(mp->m_ddev_targp); fs_put_dax(dax_ddev); } @@ -808,9 +808,9 @@ xfs_open_devices( out_free_rtdev_targ: if (mp->m_rtdev_targp) - xfs_free_buftarg(mp, mp->m_rtdev_targp); + xfs_free_buftarg(mp->m_rtdev_targp); out_free_ddev_targ: - xfs_free_buftarg(mp, mp->m_ddev_targp); + xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: xfs_blkdev_put(rtdev); fs_put_dax(dax_rtdev); @@ -1247,7 +1247,6 @@ xfs_quiesce_attr( STATIC int xfs_test_remount_options( struct super_block *sb, - struct xfs_mount *mp, char *options) { int error = 0; @@ -1278,7 +1277,7 @@ xfs_fs_remount( int error; /* First, check for complete junk; i.e. invalid options */ - error = xfs_test_remount_options(sb, mp, options); + error = xfs_test_remount_options(sb, options); if (error) return error; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 2e9e793a8f9d..5b66ac12913c 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -264,7 +264,7 @@ xfs_symlink( * Allocate an inode for the symlink. 
*/ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, &ip, NULL); + prid, &ip); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a982c0b623d0..8955254b900e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -506,8 +506,8 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); DECLARE_EVENT_CLASS(xfs_filestream_class, - TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), - TP_ARGS(ip, agno), + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno), + TP_ARGS(mp, ino, agno), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -515,10 +515,10 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, __field(int, streams) ), TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; __entry->agno = agno; - __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + __entry->streams = xfs_filestream_peek_ag(mp, agno); ), TP_printk("dev %d:%d ino 0x%llx agno %u streams %d", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -528,8 +528,8 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, ) #define DEFINE_FILESTREAM_EVENT(name) \ DEFINE_EVENT(xfs_filestream_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \ - TP_ARGS(ip, agno)) + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno), \ + TP_ARGS(mp, ino, agno)) DEFINE_FILESTREAM_EVENT(xfs_filestream_free); DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); |