diff options
Diffstat (limited to 'fs')
151 files changed, 2887 insertions, 1860 deletions
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index ff911e779651..be1e34adc3c6 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -52,10 +52,9 @@ */ struct p9_rdir { - struct mutex mutex; int head; int tail; - uint8_t *buf; + uint8_t buf[]; }; /** @@ -93,33 +92,12 @@ static void p9stat_init(struct p9_wstat *stbuf) * */ -static int v9fs_alloc_rdir_buf(struct file *filp, int buflen) +static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen) { - struct p9_rdir *rdir; - struct p9_fid *fid; - int err = 0; - - fid = filp->private_data; - if (!fid->rdir) { - rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); - - if (rdir == NULL) { - err = -ENOMEM; - goto exit; - } - spin_lock(&filp->f_dentry->d_lock); - if (!fid->rdir) { - rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir); - mutex_init(&rdir->mutex); - rdir->head = rdir->tail = 0; - fid->rdir = (void *) rdir; - rdir = NULL; - } - spin_unlock(&filp->f_dentry->d_lock); - kfree(rdir); - } -exit: - return err; + struct p9_fid *fid = filp->private_data; + if (!fid->rdir) + fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); + return fid->rdir; } /** @@ -145,20 +123,16 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) buflen = fid->clnt->msize - P9_IOHDRSZ; - err = v9fs_alloc_rdir_buf(filp, buflen); - if (err) - goto exit; - rdir = (struct p9_rdir *) fid->rdir; + rdir = v9fs_alloc_rdir_buf(filp, buflen); + if (!rdir) + return -ENOMEM; - err = mutex_lock_interruptible(&rdir->mutex); - if (err) - return err; - while (err == 0) { + while (1) { if (rdir->tail == rdir->head) { err = v9fs_file_readn(filp, rdir->buf, NULL, buflen, filp->f_pos); if (err <= 0) - goto unlock_and_exit; + return err; rdir->head = 0; rdir->tail = err; @@ -169,9 +143,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) rdir->tail - rdir->head, &st); if (err) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); - err = -EIO; p9stat_free(&st); - goto unlock_and_exit; + return -EIO; } reclen = st.size+2; @@ -180,19 +153,13 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) p9stat_free(&st); - if (over) { - err = 0; - goto unlock_and_exit; - } + if (over) + return 0; + rdir->head += reclen; filp->f_pos += reclen; } } - -unlock_and_exit: - mutex_unlock(&rdir->mutex); -exit: - return err; } /** @@ -218,21 +185,16 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, buflen = fid->clnt->msize - P9_READDIRHDRSZ; - err = v9fs_alloc_rdir_buf(filp, buflen); - if (err) - goto exit; - rdir = (struct p9_rdir *) fid->rdir; + rdir = v9fs_alloc_rdir_buf(filp, buflen); + if (!rdir) + return -ENOMEM; - err = mutex_lock_interruptible(&rdir->mutex); - if (err) - return err; - - while (err == 0) { + while (1) { if (rdir->tail == rdir->head) { err = p9_client_readdir(fid, rdir->buf, buflen, filp->f_pos); if (err <= 0) - goto unlock_and_exit; + return err; rdir->head = 0; rdir->tail = err; @@ -245,8 +207,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, &curdirent); if (err < 0) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); - err = -EIO; - goto unlock_and_exit; + return -EIO; } /* d_off in dirent structure tracks the offset into @@ -261,20 +222,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, curdirent.d_type); oldoffset = curdirent.d_off; - if (over) { - err = 0; - goto unlock_and_exit; - } + if (over) + return 0; filp->f_pos = curdirent.d_off; rdir->head += err; } } - -unlock_and_exit: - mutex_unlock(&rdir->mutex); -exit: - return err; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index c2483e97beee..7a3399767570 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -80,10 +80,6 @@ int v9fs_file_open(struct inode *inode, struct file *file) p9_client_clunk(fid); return err; } - if (file->f_flags & O_TRUNC) { - i_size_write(inode, 0); - inode->i_blocks = 0; - } if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))) generic_file_llseek(file, 0, SEEK_END); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 890bed538f9b..57d017ac68e4 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -192,9 +192,6 @@ int v9fs_uflags2omode(int uflags, int extended) break; } - if (uflags & O_TRUNC) - ret |= P9_OTRUNC; - if (extended) { if (uflags & O_EXCL) ret |= P9_OEXCL; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 40895546e103..8d24ad66dfb8 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -186,7 +186,6 @@ static int v9fs_mapped_dotl_flags(int flags) { O_CREAT, P9_DOTL_CREATE }, { O_EXCL, P9_DOTL_EXCL }, { O_NOCTTY, P9_DOTL_NOCTTY }, - { O_TRUNC, P9_DOTL_TRUNC }, { O_APPEND, P9_DOTL_APPEND }, { O_NONBLOCK, P9_DOTL_NONBLOCK }, { O_DSYNC, P9_DOTL_DSYNC }, @@ -268,8 +267,14 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, } /* Only creates */ - if (!(flags & O_CREAT) || dentry->d_inode) - return finish_no_open(file, res); + if (!(flags & O_CREAT)) + return finish_no_open(file, res); + else if (dentry->d_inode) { + if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + return -EEXIST; + else + return finish_no_open(file, res); + } v9ses = v9fs_inode2v9ses(dir); diff --git a/fs/Kconfig b/fs/Kconfig index cfe512fd1caf..780725a463b1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -68,16 +68,6 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" -config CUSE - tristate "Character device in Userspace support" - depends on FUSE_FS - help - This FUSE extension allows character devices to be - implemented in userspace. - - If you want to develop or use userspace character device - based on CUSE, answer Y or M. - config GENERIC_ACL bool select FS_POSIX_ACL diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0c42cdbabecf..49d0b43458b7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -33,6 +33,7 @@ #include <linux/elf.h> #include <linux/utsname.h> #include <linux/coredump.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include <asm/param.h> #include <asm/page.h> @@ -1320,8 +1321,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus, cputime_to_timeval(cputime.utime, &prstatus->pr_utime); cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { - cputime_to_timeval(p->utime, &prstatus->pr_utime); - cputime_to_timeval(p->stime, &prstatus->pr_stime); + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + cputime_to_timeval(utime, &prstatus->pr_utime); + cputime_to_timeval(stime, &prstatus->pr_stime); } cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index dc84732e554f..cb240dd3b402 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1375,8 +1375,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus, cputime_to_timeval(cputime.utime, &prstatus->pr_utime); cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { - cputime_to_timeval(p->utime, &prstatus->pr_utime); - cputime_to_timeval(p->stime, &prstatus->pr_stime); + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + cputime_to_timeval(utime, &prstatus->pr_utime); + cputime_to_timeval(stime, &prstatus->pr_stime); } cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 521e9d4424f6..5a3327b8f90d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3997,7 +3997,7 @@ again: * We make the other tasks wait for the flush only when we can flush * all things. */ - if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; } @@ -4534,7 +4534,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) unsigned nr_extents = 0; int extra_reserve = 0; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; - int ret; + int ret = 0; bool delalloc_lock = true; /* If we are a free space inode we need to not flush since we will be in @@ -4579,20 +4579,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); - if (root->fs_info->quota_enabled) { + if (root->fs_info->quota_enabled) ret = btrfs_qgroup_reserve(root, num_bytes + nr_extents * root->leafsize); - if (ret) { - spin_lock(&BTRFS_I(inode)->lock); - calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); - if (delalloc_lock) - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - return ret; - } - } - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + /* + * ret != 0 here means the qgroup reservation failed, we go straight to + * the shared error handling then. + */ + if (ret == 0) + ret = reserve_metadata_bytes(root, block_rsv, + to_reserve, flush); + if (ret) { u64 to_free = 0; unsigned dropped; @@ -5560,7 +5558,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int empty_cluster = 2 * 1024 * 1024; struct btrfs_space_info *space_info; int loop = 0; - int index = 0; + int index = __get_raid_index(data); int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; @@ -6788,11 +6786,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, &wc->flags[level]); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return ret; } BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return 1; } } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f169d6b11d7f..fdb7a8db3b57 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) return 0; + if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || + test_bit(EXTENT_FLAG_LOGGING, &next->flags)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -255,7 +259,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, if (!em) goto out; - list_move(&em->list, &tree->modified_extents); + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_move(&em->list, &tree->modified_extents); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); em->mod_start = em->start; @@ -280,6 +285,13 @@ out: } +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +{ + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + if (em->in_tree) + try_merge_map(tree, em); +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 922943ce29e8..c6598c89cff8 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index bd38cef42358..94aa53b38721 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, if (!contig) offset = page_offset(bvec->bv_page) + bvec->bv_offset; - if (!contig && (offset >= ordered->file_offset + ordered->len || - offset < ordered->file_offset)) { + if (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset) { unsigned long bytes_left; sums->len = this_sum_bytes; this_sum_bytes = 0; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 77061bf43edb..aeb84469d2c4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; int num_defrag; + int index; + int ret; /* get the inode */ key.objectid = defrag->root; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(inode_root)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode_root); + ret = PTR_ERR(inode_root); + goto cleanup; + } + if (btrfs_root_refs(&inode_root->root_item) == 0) { + ret = -ENOENT; + goto cleanup; } key.objectid = defrag->ino; @@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); if (IS_ERR(inode)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode); + ret = PTR_ERR(inode); + goto cleanup; } + srcu_read_unlock(&fs_info->subvol_srcu, index); /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); @@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, iput(inode); return 0; +cleanup: + srcu_read_unlock(&fs_info->subvol_srcu, index); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; } /* @@ -1594,9 +1608,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (err < 0 && num_written > 0) num_written = err; } -out: + if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); +out: sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; @@ -2241,6 +2256,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) if (lockend <= lockstart) lockend = lockstart + root->sectorsize; + lockend--; len = lockend - lockstart + 1; len = max_t(u64, len, root->sectorsize); @@ -2307,9 +2323,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) } } - *offset = start; - free_extent_map(em); - break; + if (!test_bit(EXTENT_FLAG_PREALLOC, + &em->flags)) { + *offset = start; + free_extent_map(em); + break; + } } } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 59ea2e4349c9..0be7a8742a43 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1862,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; - int ret = 0; + int ret; + bool re_search = false; spin_lock(&ctl->tree_lock); again: + ret = 0; if (!bytes) goto out_lock; @@ -1879,17 +1881,17 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - /* the tree logging code might be calling us before we - * have fully loaded the free space rbtree for this - * block group. So it is possible the entry won't - * be in the rbtree yet at all. The caching code - * will make sure not to put it in the rbtree if - * the logging code has pinned it. + /* + * If we found a partial bit of our free space in a + * bitmap but then couldn't find the other part this may + * be a problem, so WARN about it. */ + WARN_ON(re_search); goto out_lock; } } + re_search = false; if (!info->bitmap) { unlink_free_space(ctl, info); if (offset == info->offset) { @@ -1935,8 +1937,10 @@ again: } ret = remove_from_bitmap(ctl, info, &offset, &bytes); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { + re_search = true; goto again; + } BUG_ON(ret); /* logic error */ out_lock: spin_unlock(&ctl->tree_lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 16d9e8e191e6..cc93b23ca352 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, }; -static int btrfs_setsize(struct inode *inode, loff_t newsize); +static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, @@ -2478,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; + + /* 1 for the orphan item deletion. */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + goto out; + ret = btrfs_truncate(inode); } else { nr_unlink++; @@ -3665,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) block_end - cur_offset, 0); if (IS_ERR(em)) { err = PTR_ERR(em); + em = NULL; break; } last_byte = min(extent_map_end(em), block_end); @@ -3748,16 +3761,27 @@ next: return err; } -static int btrfs_setsize(struct inode *inode, loff_t newsize) +static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; loff_t oldsize = i_size_read(inode); + loff_t newsize = attr->ia_size; + int mask = attr->ia_valid; int ret; if (newsize == oldsize) return 0; + /* + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) + inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize > oldsize) { truncate_pagecache(inode, oldsize, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); @@ -3783,9 +3807,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &BTRFS_I(inode)->runtime_flags); + /* + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion. + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * We need to do this in case we fail at _any_ point during the + * actual truncate. Once we do the truncate_setsize we could + * invalidate pages which forces any outstanding ordered io to + * be instantly completed which will give us extents that need + * to be truncated. If we fail to get an orphan inode down we + * could have left over extents that were never meant to live, + * so we need to garuntee from this point on that everything + * will be consistent. + */ + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + return ret; + /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); ret = btrfs_truncate(inode); + if (ret && inode->i_nlink) + btrfs_orphan_del(NULL, inode); } return ret; @@ -3805,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setsize(inode, attr->ia_size); + err = btrfs_setsize(inode, attr); if (err) return err; } @@ -5572,10 +5621,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag return em; if (em) { /* - * if our em maps to a hole, there might - * actually be delalloc bytes behind it + * if our em maps to + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. */ - if (em->block_start != EXTENT_MAP_HOLE) + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) return em; else hole_em = em; @@ -5657,6 +5709,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag */ em->block_start = hole_em->block_start; em->block_len = hole_len; + if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); } else { em->start = range_start; em->len = found; @@ -6915,11 +6969,9 @@ static int btrfs_truncate(struct inode *inode) /* * 1 for the truncate slack space - * 1 for the orphan item we're going to add - * 1 for the orphan item deletion * 1 for updating the inode. */ - trans = btrfs_start_transaction(root, 4); + trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; @@ -6930,12 +6982,6 @@ static int btrfs_truncate(struct inode *inode) min_size); BUG_ON(ret); - ret = btrfs_orphan_add(trans, inode); - if (ret) { - btrfs_end_transaction(trans, root); - goto out; - } - /* * setattr is responsible for setting the ordered_data_close flag, * but that is only tested during the last file release. That @@ -7004,12 +7050,6 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(trans, inode); if (ret) err = ret; - } else if (ret && inode->i_nlink > 0) { - /* - * Failed to do the truncate, remove us from the in memory - * orphan list. - */ - ret = btrfs_orphan_del(NULL, inode); } if (trans) { @@ -7531,41 +7571,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) */ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { - struct list_head *head = &root->fs_info->delalloc_inodes; struct btrfs_inode *binode; struct inode *inode; struct btrfs_delalloc_work *work, *next; struct list_head works; + struct list_head splice; int ret = 0; if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; INIT_LIST_HEAD(&works); - + INIT_LIST_HEAD(&splice); +again: spin_lock(&root->fs_info->delalloc_lock); - while (!list_empty(head)) { - binode = list_entry(head->next, struct btrfs_inode, + list_splice_init(&root->fs_info->delalloc_inodes, &splice); + while (!list_empty(&splice)) { + binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); + + list_del_init(&binode->delalloc_inodes); + inode = igrab(&binode->vfs_inode); if (!inode) - list_del_init(&binode->delalloc_inodes); + continue; + + list_add_tail(&binode->delalloc_inodes, + &root->fs_info->delalloc_inodes); spin_unlock(&root->fs_info->delalloc_lock); - if (inode) { - work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); - if (!work) { - ret = -ENOMEM; - goto out; - } - list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); + + work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + if (unlikely(!work)) { + ret = -ENOMEM; + goto out; } + list_add_tail(&work->list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &work->work); + cond_resched(); spin_lock(&root->fs_info->delalloc_lock); } spin_unlock(&root->fs_info->delalloc_lock); + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + + spin_lock(&root->fs_info->delalloc_lock); + if (!list_empty(&root->fs_info->delalloc_inodes)) { + spin_unlock(&root->fs_info->delalloc_lock); + goto again; + } + spin_unlock(&root->fs_info->delalloc_lock); + /* the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return @@ -7578,11 +7638,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); + return 0; out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); } + + if (!list_empty_careful(&splice)) { + spin_lock(&root->fs_info->delalloc_lock); + list_splice_tail(&splice, &root->fs_info->delalloc_inodes); + spin_unlock(&root->fs_info->delalloc_lock); + } return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4b4516770f05..338f2597bf7f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -515,7 +515,6 @@ static noinline int create_subvol(struct btrfs_root *root, BUG_ON(ret); - d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: if (async_transid) { *async_transid = trans->transid; @@ -525,6 +524,10 @@ fail: } if (err && !ret) ret = err; + + if (!ret) + d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); + return ret; } @@ -1339,7 +1342,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + mnt_drop_write_file(file); + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -1362,6 +1366,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", @@ -1369,9 +1374,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_free; } - if (device->fs_devices && device->fs_devices->seeding) { + + if (!device->writeable) { printk(KERN_INFO "btrfs: resizer unable to apply on " - "seeding device %llu\n", + "readonly device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_free; @@ -1443,8 +1449,8 @@ out_free: kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } @@ -2095,13 +2101,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = inode_permission(inode, MAY_WRITE | MAY_EXEC); if (err) goto out_dput; - - /* check if subvolume may be deleted by a non-root user */ - err = btrfs_may_delete(dir, dentry, 1); - if (err) - goto out_dput; } + /* check if subvolume may be deleted by a user */ + err = btrfs_may_delete(dir, dentry, 1); + if (err) + goto out_dput; + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out_dput; @@ -2183,19 +2189,20 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) struct btrfs_ioctl_defrag_range_args *range; int ret; - if (btrfs_root_readonly(root)) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + mnt_drop_write_file(file); + return -EINVAL; } - ret = mnt_want_write_file(file); - if (ret) { - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); - return ret; + + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; } switch (inode->i_mode & S_IFMT) { @@ -2247,8 +2254,8 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EINVAL; } out: - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } @@ -2263,7 +2270,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2300,7 +2307,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); mnt_drop_write_file(file); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2316,8 +2323,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } @@ -3437,8 +3444,8 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; + bool need_unlock; /* for mut. excl. ops lock */ int ret; - int need_to_clear_lock = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3447,14 +3454,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&fs_info->volume_mutex); +again: + if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + need_unlock = true; + goto locked; + } + + /* + * mut. excl. ops lock is locked. Three possibilites: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* this is either (2) or (3) */ + if (!atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + if (!mutex_trylock(&fs_info->volume_mutex)) + goto again; + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !atomic_read(&fs_info->balance_running)) { + /* this is (3) */ + need_unlock = false; + goto locked; + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + goto again; + } else { + /* this is (2) */ + mutex_unlock(&fs_info->balance_mutex); + ret = -EINPROGRESS; + goto out; + } + } else { + /* this is (1) */ + mutex_unlock(&fs_info->balance_mutex); + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + ret = -EINVAL; + goto out; + } + +locked: + BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); if (IS_ERR(bargs)) { ret = PTR_ERR(bargs); - goto out; + goto out_unlock; } if (bargs->flags & BTRFS_BALANCE_RESUME) { @@ -3474,13 +3528,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bargs = NULL; } - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + if (fs_info->balance_ctl) { ret = -EINPROGRESS; goto out_bargs; } - need_to_clear_lock = 1; bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { @@ -3501,11 +3552,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) } do_balance: - ret = btrfs_balance(bctl, bargs); /* - * bctl is freed in __cancel_balance or in free_fs_info if - * restriper was paused all the way until unmount + * Ownership of bctl and mutually_exclusive_operation_running + * goes to to btrfs_balance. bctl is freed in __cancel_balance, + * or, if restriper was paused all the way until unmount, in + * free_fs_info. mutually_exclusive_operation_running is + * cleared in __cancel_balance. */ + need_unlock = false; + + ret = btrfs_balance(bctl, bargs); + if (arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; @@ -3513,12 +3570,12 @@ do_balance: out_bargs: kfree(bargs); -out: - if (need_to_clear_lock) - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); +out_unlock: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); + if (need_unlock) + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); +out: mnt_drop_write_file(file); return ret; } @@ -3698,6 +3755,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto drop_write; } + if (!sa->qgroupid) { + ret = -EINVAL; + goto out; + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f10731297040..e5ed56729607 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -836,9 +836,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, * if the disk i_size is already at the inode->i_size, or * this ordered extent is inside the disk i_size, we're done */ - if (disk_i_size == i_size || offset <= disk_i_size) { + if (disk_i_size == i_size) + goto out; + + /* + * We still need to update disk_i_size if outstanding_isize is greater + * than disk_i_size. + */ + if (offset <= disk_i_size && + (!ordered || ordered->outstanding_isize <= disk_i_size)) goto out; - } /* * walk backward from this ordered extent to disk_i_size. @@ -870,7 +877,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, break; if (test->file_offset >= i_size) break; - if (test->file_offset >= disk_i_size) { + if (entry_end(test) > disk_i_size) { /* * we don't update disk_i_size now, so record this * undealt i_size. Or we will not know the real diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fe9d02c45f8e..a5c856234323 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -379,6 +379,13 @@ next1: ret = add_relation_rb(fs_info, found_key.objectid, found_key.offset); + if (ret == -ENOENT) { + printk(KERN_WARNING + "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", + (unsigned long long)found_key.objectid, + (unsigned long long)found_key.offset); + ret = 0; /* ignore the error */ + } if (ret) goto out; next2: @@ -956,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; int ret = 0; quota_root = fs_info->quota_root; if (!quota_root) return -EINVAL; + /* check if there are no relations to this qgroup */ + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (qgroup) { + if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) { + spin_unlock(&fs_info->qgroup_lock); + return -EBUSY; + } + } + spin_unlock(&fs_info->qgroup_lock); + ret = del_qgroup_item(trans, quota_root, qgroupid); spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(quota_root->fs_info, qgroupid); - spin_unlock(&fs_info->qgroup_lock); return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bdbb94f245c9..67783e03d121 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -580,20 +580,29 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) int corrected = 0; struct btrfs_key key; struct inode *inode = NULL; + struct btrfs_fs_info *fs_info; u64 end = offset + PAGE_SIZE - 1; struct btrfs_root *local_root; + int srcu_index; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); - if (IS_ERR(local_root)) + + fs_info = fixup->root->fs_info; + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; - inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -606,7 +615,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) } if (PageUptodate(page)) { - struct btrfs_fs_info *fs_info; if (PageDirty(page)) { /* * we need to write the data to the defect sector. the @@ -3180,18 +3188,25 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) u64 physical_for_dev_replace; u64 len; struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + int srcu_index; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + local_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(local_root)) + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 54454542ad40..321b7fb4e441 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx, (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); - if (!nce_head) + if (!nce_head) { + kfree(nce); return -ENOMEM; + } INIT_LIST_HEAD(nce_head); ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 99545df1b86c..d8982e9601d3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -267,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, function, line, errstr); return; } - trans->transaction->aborted = errno; + ACCESS_ONCE(trans->transaction->aborted) = errno; __btrfs_std_error(root->fs_info, function, line, errno, NULL); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 87fac9a21ea5..fc03aa60b684 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -333,12 +333,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, &root->fs_info->trans_block_rsv, num_bytes, flush); if (ret) - return ERR_PTR(ret); + goto reserve_fail; } again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); - if (!h) - return ERR_PTR(-ENOMEM); + if (!h) { + ret = -ENOMEM; + goto alloc_fail; + } /* * If we are JOIN_NOLOCK we're already committing a transaction and @@ -365,11 +367,7 @@ again: if (ret < 0) { /* We must get the transaction if we are JOIN_NOLOCK. */ BUG_ON(type == TRANS_JOIN_NOLOCK); - - if (type < TRANS_JOIN_NOLOCK) - sb_end_intwrite(root->fs_info->sb); - kmem_cache_free(btrfs_trans_handle_cachep, h); - return ERR_PTR(ret); + goto join_fail; } cur_trans = root->fs_info->running_transaction; @@ -410,6 +408,19 @@ got_it: if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; return h; + +join_fail: + if (type < TRANS_JOIN_NOLOCK) + sb_end_intwrite(root->fs_info->sb); + kmem_cache_free(btrfs_trans_handle_cachep, h); +alloc_fail: + if (num_bytes) + btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, + num_bytes); +reserve_fail: + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); + return ERR_PTR(ret); } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, @@ -1468,7 +1479,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } - if (cur_trans->aborted) { + /* Stop the commit early if ->aborted is set */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; goto cleanup_transaction; } @@ -1574,6 +1586,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); + /* ->aborted might be set after the previous check, so check it */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + goto cleanup_transaction; + } /* * the reloc mutex makes sure that we stop * the balancing code from coming in and moving @@ -1657,6 +1674,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } + /* + * The tasks which save the space cache and inode cache may also + * update ->aborted, check it. + */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto cleanup_transaction; + } + btrfs_prepare_extent_commit(trans, root); cur_trans = root->fs_info->running_transaction; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 83186c7e45d4..9027bb1e7466 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3357,6 +3357,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (skip_csum) return 0; + if (em->compress_type) { + csum_offset = 0; + csum_len = block_len; + } + /* block start is already adjusted for the file extent offset. */ ret = btrfs_lookup_csums_range(log->fs_info->csum_root, em->block_start + csum_offset, @@ -3410,13 +3415,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em = list_entry(extents.next, struct extent_map, list); list_del_init(&em->list); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); /* * If we had an error we just need to delete everybody from our * private list. */ if (ret) { + clear_em_logging(tree, em); free_extent_map(em); continue; } @@ -3424,8 +3429,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); ret = log_one_extent(trans, inode, root, em, path); - free_extent_map(em); write_lock(&tree->lock); + clear_em_logging(tree, em); + free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cce6aa74012..5cbb7f4b1672 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1431,7 +1431,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } } else { ret = btrfs_get_bdev_and_sb(device_path, - FMODE_READ | FMODE_EXCL, + FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder, 0, &bdev, &bh); if (ret) @@ -1556,7 +1556,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = 0; /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + if (bdev) + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); error_brelse: brelse(bh); @@ -2614,7 +2615,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = btrfs_block_group_used(&cache->item); - user_thresh = div_factor_fine(cache->key.offset, bargs->usage); + if (bargs->usage == 0) + user_thresh = 0; + else if (bargs->usage > 100) + user_thresh = cache->key.offset; + else + user_thresh = div_factor_fine(cache->key.offset, + bargs->usage); + if (chunk_used < user_thresh) ret = 0; @@ -2959,6 +2967,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); BUG_ON(ret); + + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, @@ -3138,8 +3148,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, out: if (bctl->flags & BTRFS_BALANCE_RESUME) __cancel_balance(fs_info); - else + else { kfree(bctl); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + } return ret; } @@ -3156,7 +3168,6 @@ static int balance_kthread(void *data) ret = btrfs_balance(fs_info->balance_ctl, NULL); } - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); @@ -3179,7 +3190,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } - WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); if (IS_ERR(tsk)) return PTR_ERR(tsk); @@ -3233,6 +3243,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); @@ -3496,7 +3508,7 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { { 1, 1, 2, 2, 2, 2 /* raid1 */ }, { 1, 2, 1, 1, 1, 2 /* dup */ }, { 1, 1, 0, 2, 1, 1 /* raid0 */ }, - { 1, 1, 0, 1, 1, 1 /* single */ }, + { 1, 1, 1, 1, 1, 1 /* single */ }, }; static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, diff --git a/fs/buffer.c b/fs/buffer.c index c017a2dfb909..7a75c3e0fd58 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2935,6 +2935,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) void *kaddr = kmap_atomic(bh->b_page); memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); kunmap_atomic(kaddr); + flush_dcache_page(bh->b_page); } } diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ce5cbd717bfc..210fce2df308 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -226,6 +226,8 @@ compose_mount_options_out: compose_mount_options_err: kfree(mountdata); mountdata = ERR_PTR(rc); + kfree(*devname); + *devname = NULL; goto compose_mount_options_out; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f653835d067b..de7f9168a118 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -228,7 +228,6 @@ cifs_alloc_inode(struct super_block *sb) cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; - cifs_inode->leave_pages_clean = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index aea1eec64911..e6899cea1c35 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -386,6 +386,7 @@ struct smb_version_values { unsigned int cap_unix; unsigned int cap_nt_find; unsigned int cap_large_files; + unsigned int oplock_read; }; #define HEADER_SIZE(server) (server->vals->header_size) @@ -1030,7 +1031,6 @@ struct cifsInodeInfo { bool clientCanCacheAll; /* read and writebehind oplock */ bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ - bool leave_pages_clean; /* protected by i_mutex, not set pages dirty */ unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 17c3643e5950..12b3da39733b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1917,7 +1917,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs) } case AF_INET6: { struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; - struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs; + struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs; return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr); } default: diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0a6677ba212b..8ea6ca50a665 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -238,6 +238,23 @@ out: return rc; } +static bool +cifs_has_mand_locks(struct cifsInodeInfo *cinode) +{ + struct cifs_fid_locks *cur; + bool has_locks = false; + + down_read(&cinode->lock_sem); + list_for_each_entry(cur, &cinode->llist, llist) { + if (!list_empty(&cur->locks)) { + has_locks = true; + break; + } + } + up_read(&cinode->lock_sem); + return has_locks; +} + struct cifsFileInfo * cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock) @@ -248,6 +265,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct cifsFileInfo *cfile; struct cifs_fid_locks *fdlocks; struct cifs_tcon *tcon = tlink_tcon(tlink); + struct TCP_Server_Info *server = tcon->ses->server; cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); if (cfile == NULL) @@ -276,12 +294,22 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + /* + * If the server returned a read oplock and we have mandatory brlocks, + * set oplock level to None. + */ + if (oplock == server->vals->oplock_read && + cifs_has_mand_locks(cinode)) { + cFYI(1, "Reset oplock val from read to None due to mand locks"); + oplock = 0; + } + spin_lock(&cifs_file_list_lock); - if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE) + if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock) oplock = fid->pending_open->oplock; list_del(&fid->pending_open->olist); - tlink_tcon(tlink)->ses->server->ops->set_fid(cfile, fid, oplock); + server->ops->set_fid(cfile, fid, oplock); list_add(&cfile->tlist, &tcon->openFileList); /* if readable file instance put first in list*/ @@ -1422,6 +1450,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; + struct inode *inode = cfile->dentry->d_inode; if (posix_lck) { int posix_lock_type; @@ -1459,6 +1488,21 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, if (!rc) goto out; + /* + * Windows 7 server can delay breaking lease from read to None + * if we set a byte-range lock on a file - break it explicitly + * before sending the lock to the server to be sure the next + * read won't conflict with non-overlapted locks due to + * pagereading. + */ + if (!CIFS_I(inode)->clientCanCacheAll && + CIFS_I(inode)->clientCanCacheRead) { + cifs_invalidate_mapping(inode); + cFYI(1, "Set no oplock for inode=%p due to mand locks", + inode); + CIFS_I(inode)->clientCanCacheRead = false; + } + rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type, 1, 0, wait_flag); if (rc) { @@ -2103,15 +2147,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, } else { rc = copied; pos += copied; - /* - * When we use strict cache mode and cifs_strict_writev was run - * with level II oplock (indicated by leave_pages_clean field of - * CIFS_I(inode)), we can leave pages clean - cifs_strict_writev - * sent the data to the server itself. - */ - if (!CIFS_I(inode)->leave_pages_clean || - !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)) - set_page_dirty(page); + set_page_dirty(page); } if (rc > 0) { @@ -2462,8 +2498,8 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, } static ssize_t -cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool cache_ex) +cifs_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; @@ -2485,12 +2521,8 @@ cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov, server->vals->exclusive_lock_type, NULL, CIFS_WRITE_OP)) { mutex_lock(&inode->i_mutex); - if (!cache_ex) - cinode->leave_pages_clean = true; rc = __generic_file_aio_write(iocb, iov, nr_segs, - &iocb->ki_pos); - if (!cache_ex) - cinode->leave_pages_clean = false; + &iocb->ki_pos); mutex_unlock(&inode->i_mutex); } @@ -2517,60 +2549,32 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifsFileInfo *cfile = (struct cifsFileInfo *) iocb->ki_filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - ssize_t written, written2; - /* - * We need to store clientCanCacheAll here to prevent race - * conditions - this value can be changed during an execution - * of generic_file_aio_write. For CIFS it can be changed from - * true to false only, but for SMB2 it can be changed both from - * true to false and vice versa. So, we can end up with a data - * stored in the cache, not marked dirty and not sent to the - * server if this value changes its state from false to true - * after cifs_write_end. - */ - bool cache_ex = cinode->clientCanCacheAll; - bool cache_read = cinode->clientCanCacheRead; - int rc; - loff_t saved_pos; + ssize_t written; - if (cache_ex) { + if (cinode->clientCanCacheAll) { if (cap_unix(tcon->ses) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( - tcon->fsUnixInfo.Capability))) + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) + && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) return generic_file_aio_write(iocb, iov, nr_segs, pos); - return cifs_pagecache_writev(iocb, iov, nr_segs, pos, cache_ex); + return cifs_writev(iocb, iov, nr_segs, pos); } - /* - * For files without exclusive oplock in strict cache mode we need to - * write the data to the server exactly from the pos to pos+len-1 rather - * than flush all affected pages because it may cause a error with - * mandatory locks on these pages but not on the region from pos to - * ppos+len-1. + * For non-oplocked files in strict cache mode we need to write the data + * to the server exactly from the pos to pos+len-1 rather than flush all + * affected pages because it may cause a error with mandatory locks on + * these pages but not on the region from pos to ppos+len-1. */ written = cifs_user_writev(iocb, iov, nr_segs, pos); - if (!cache_read || written <= 0) - return written; - - saved_pos = iocb->ki_pos; - iocb->ki_pos = pos; - /* we have a read oplock - need to store a data in the page cache */ - if (cap_unix(tcon->ses) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( - tcon->fsUnixInfo.Capability))) - written2 = generic_file_aio_write(iocb, iov, nr_segs, pos); - else - written2 = cifs_pagecache_writev(iocb, iov, nr_segs, pos, - cache_ex); - /* errors occured during writing - invalidate the page cache */ - if (written2 < 0) { - rc = cifs_invalidate_mapping(inode); - if (rc) - written = (ssize_t)rc; - else - iocb->ki_pos = saved_pos; + if (written > 0 && cinode->clientCanCacheRead) { + /* + * Windows 7 server can delay breaking level2 oplock if a write + * request comes - break it on the client to prevent reading + * an old data. + */ + cifs_invalidate_mapping(inode); + cFYI(1, "Set no oplock for inode=%p after a write operation", + inode); + cinode->clientCanCacheRead = false; } return written; } @@ -3577,6 +3581,13 @@ void cifs_oplock_break(struct work_struct *work) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; + if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead && + cifs_has_mand_locks(cinode)) { + cFYI(1, "Reset oplock to None for inode=%p due to mand locks", + inode); + cinode->clientCanCacheRead = false; + } + if (inode && S_ISREG(inode->i_mode)) { if (cinode->clientCanCacheRead) break_lease(inode, O_RDONLY); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index a5d234c8d5d9..47bc5a87f94e 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -53,6 +53,13 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf, mutex_unlock(&server->srv_mutex); return rc; } + + /* + * The response to this call was already factored into the sequence + * number when the call went out, so we must adjust it back downward + * after signing here. + */ + --server->sequence_number; rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); mutex_unlock(&server->srv_mutex); @@ -952,4 +959,5 @@ struct smb_version_values smb1_values = { .cap_unix = CAP_UNIX, .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, .cap_large_files = CAP_LARGE_FILES, + .oplock_read = OPLOCK_READ, }; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d79de7bc4435..c9c7aa7ed966 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -708,6 +708,7 @@ struct smb_version_values smb20_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, + .oplock_read = SMB2_OPLOCK_LEVEL_II, }; struct smb_version_values smb21_values = { @@ -725,6 +726,7 @@ struct smb_version_values smb21_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, + .oplock_read = SMB2_OPLOCK_LEVEL_II, }; struct smb_version_values smb30_values = { diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 76d974c952fe..1a528680ec5a 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -144,9 +144,6 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec, *sent = 0; - if (ssocket == NULL) - return -ENOTSOCK; /* BB eventually add reconnect code here */ - smb_msg.msg_name = (struct sockaddr *) &server->dstaddr; smb_msg.msg_namelen = sizeof(struct sockaddr); smb_msg.msg_control = NULL; @@ -291,6 +288,9 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) struct socket *ssocket = server->ssocket; int val = 1; + if (ssocket == NULL) + return -ENOTSOCK; + cFYI(1, "Sending smb: smb_len=%u", smb_buf_length); dump_smb(iov[0].iov_base, iov[0].iov_len); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 153bb1e42e63..a5f12b7e228d 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -176,7 +176,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) opts->uid = uid; break; case Opt_gid: - if (match_octal(&args[0], &option)) + if (match_int(&args[0], &option)) return -EINVAL; gid = make_kgid(current_user_ns(), option); if (!gid_valid(gid)) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 7ff49852b0cb..911649a47dd5 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -503,11 +503,11 @@ static ssize_t device_write(struct file *file, const char __user *buf, #endif return -EINVAL; -#ifdef CONFIG_COMPAT - if (count > sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN) -#else + /* + * can't compare against COMPAT/dlm_write_request32 because + * we don't yet know if is64bit is zero + */ if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) -#endif return -EINVAL; kbuf = kzalloc(count + 1, GFP_NOFS); diff --git a/fs/exec.c b/fs/exec.c index 18c45cac368f..20df02c1cc70 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -434,8 +434,9 @@ static int count(struct user_arg_ptr argv, int max) if (IS_ERR(p)) return -EFAULT; - if (i++ >= max) + if (i >= max) return -E2BIG; + ++i; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 0a475c881852..987358740cb9 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -41,6 +41,7 @@ config EXT4_USE_FOR_EXT23 config EXT4_FS_POSIX_ACL bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -53,6 +54,7 @@ config EXT4_FS_POSIX_ACL config EXT4_FS_SECURITY bool "Ext4 Security Labels" + depends on EXT4_FS help Security labels support alternative access control models implemented by security modules like SELinux. This option diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8990165346ee..f9ed946a448e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -722,7 +722,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, ext4_warning(dir->i_sb, "Node failed checksum"); brelse(bh); *err = ERR_BAD_DX_DIR; - goto fail; + goto fail2; } set_buffer_verified(bh); @@ -2368,7 +2368,6 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, } inode->i_size = EXT4_I(inode)->i_disksize = blocksize; - dir_block = ext4_bread(handle, inode, 0, 1, &err); if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { if (!err) { err = -EIO; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index e95b94945d5f..137af4255da6 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -191,15 +191,14 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) retval = f2fs_getxattr(inode, name_index, "", value, retval); } - if (retval < 0) { - if (retval == -ENODATA) - acl = NULL; - else - acl = ERR_PTR(retval); - } else { + if (retval > 0) acl = f2fs_acl_from_disk(value, retval); - } + else if (retval == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(retval); kfree(value); + if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6ef36c37e2be..ff3c8439af87 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -214,7 +214,6 @@ retry: goto retry; } new->ino = ino; - INIT_LIST_HEAD(&new->list); /* add new_oentry into list which is sorted by inode number */ if (orphan) { @@ -772,7 +771,7 @@ void init_orphan_info(struct f2fs_sb_info *sbi) sbi->n_orphans = 0; } -int create_checkpoint_caches(void) +int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", sizeof(struct orphan_inode_entry), NULL); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3aa5ce7cab83..7bd22a201125 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -547,6 +547,15 @@ redirty_out: #define MAX_DESIRED_PAGES_WP 4096 +static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = mapping->a_ops->writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -563,7 +572,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!S_ISDIR(inode->i_mode)) mutex_lock(&sbi->writepages); - ret = generic_writepages(mapping, wbc); + ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); if (!S_ISDIR(inode->i_mode)) mutex_unlock(&sbi->writepages); f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); @@ -689,6 +698,11 @@ static int f2fs_set_data_page_dirty(struct page *page) return 0; } +static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, get_data_block_ro); +} + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, @@ -700,4 +714,5 @@ const struct address_space_operations f2fs_dblock_aops = { .invalidatepage = f2fs_invalidate_data_page, .releasepage = f2fs_release_data_page, .direct_IO = f2fs_direct_IO, + .bmap = f2fs_bmap, }; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0e0380a588ad..c8c37307b326 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -26,6 +26,7 @@ static LIST_HEAD(f2fs_stat_list); static struct dentry *debugfs_root; +static DEFINE_MUTEX(f2fs_stat_mutex); static void update_general_status(struct f2fs_sb_info *sbi) { @@ -180,18 +181,14 @@ static int stat_show(struct seq_file *s, void *v) int i = 0; int j; + mutex_lock(&f2fs_stat_mutex); list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { - mutex_lock(&si->stat_lock); - if (!si->sbi) { - mutex_unlock(&si->stat_lock); - continue; - } update_general_status(si->sbi); seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++); - seq_printf(s, "[SB: 1] [CP: 2] [NAT: %d] [SIT: %d] ", - si->nat_area_segs, si->sit_area_segs); + seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", + si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", @@ -286,8 +283,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n", (si->base_mem + si->cache_mem) >> 10, si->base_mem >> 10, si->cache_mem >> 10); - mutex_unlock(&si->stat_lock); } + mutex_unlock(&f2fs_stat_mutex); return 0; } @@ -303,7 +300,7 @@ static const struct file_operations stat_fops = { .release = single_release, }; -static int init_stats(struct f2fs_sb_info *sbi) +int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; @@ -313,9 +310,6 @@ static int init_stats(struct f2fs_sb_info *sbi) return -ENOMEM; si = sbi->stat_info; - mutex_init(&si->stat_lock); - list_add_tail(&si->stat_list, &f2fs_stat_list); - si->all_area_segs = le32_to_cpu(raw_super->segment_count); si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); @@ -325,21 +319,11 @@ static int init_stats(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); si->sbi = sbi; - return 0; -} -int f2fs_build_stats(struct f2fs_sb_info *sbi) -{ - int retval; - - retval = init_stats(sbi); - if (retval) - return retval; - - if (!debugfs_root) - debugfs_root = debugfs_create_dir("f2fs", NULL); + mutex_lock(&f2fs_stat_mutex); + list_add_tail(&si->stat_list, &f2fs_stat_list); + mutex_unlock(&f2fs_stat_mutex); - debugfs_create_file("status", S_IRUGO, debugfs_root, NULL, &stat_fops); return 0; } @@ -347,14 +331,22 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = sbi->stat_info; + mutex_lock(&f2fs_stat_mutex); list_del(&si->stat_list); - mutex_lock(&si->stat_lock); - si->sbi = NULL; - mutex_unlock(&si->stat_lock); + mutex_unlock(&f2fs_stat_mutex); + kfree(sbi->stat_info); } -void destroy_root_stats(void) +void __init f2fs_create_root_stats(void) +{ + debugfs_root = debugfs_create_dir("f2fs", NULL); + if (debugfs_root) + debugfs_create_file("status", S_IRUGO, debugfs_root, + NULL, &stat_fops); +} + +void f2fs_destroy_root_stats(void) { debugfs_remove_recursive(debugfs_root); debugfs_root = NULL; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 951ed52748f6..989980e16d0b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -503,7 +503,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, } if (inode) { - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_ctime = CURRENT_TIME; drop_nlink(inode); if (S_ISDIR(inode->i_mode)) { drop_nlink(inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 13c6dfbb7183..c8e2d751ef9c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -211,11 +211,11 @@ struct dnode_of_data { static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, struct page *ipage, struct page *npage, nid_t nid) { + memset(dn, 0, sizeof(*dn)); dn->inode = inode; dn->inode_page = ipage; dn->node_page = npage; dn->nid = nid; - dn->inode_page_locked = 0; } /* @@ -877,6 +877,8 @@ bool f2fs_empty_dir(struct inode *); * super.c */ int f2fs_sync_fs(struct super_block *, int); +extern __printf(3, 4) +void f2fs_msg(struct super_block *, const char *, const char *, ...); /* * hash.c @@ -912,7 +914,7 @@ int restore_node_summary(struct f2fs_sb_info *, unsigned int, void flush_nat_entries(struct f2fs_sb_info *); int build_node_manager(struct f2fs_sb_info *); void destroy_node_manager(struct f2fs_sb_info *); -int create_node_manager_caches(void); +int __init create_node_manager_caches(void); void destroy_node_manager_caches(void); /* @@ -964,7 +966,7 @@ void sync_dirty_dir_inodes(struct f2fs_sb_info *); void block_operations(struct f2fs_sb_info *); void write_checkpoint(struct f2fs_sb_info *, bool, bool); void init_orphan_info(struct f2fs_sb_info *); -int create_checkpoint_caches(void); +int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); /* @@ -984,9 +986,9 @@ int do_write_data_page(struct page *); int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int); -int f2fs_gc(struct f2fs_sb_info *, int); +int f2fs_gc(struct f2fs_sb_info *); void build_gc_manager(struct f2fs_sb_info *); -int create_gc_caches(void); +int __init create_gc_caches(void); void destroy_gc_caches(void); /* @@ -1058,7 +1060,8 @@ struct f2fs_stat_info { int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void destroy_root_stats(void); +void __init f2fs_create_root_stats(void); +void f2fs_destroy_root_stats(void); #else #define stat_inc_call_count(si) #define stat_inc_seg_count(si, type) @@ -1068,7 +1071,8 @@ void destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void destroy_root_stats(void) { } +static inline void __init f2fs_create_root_stats(void) { } +static inline void f2fs_destroy_root_stats(void) { } #endif extern const struct file_operations f2fs_dir_operations; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7f9ea9271ebe..3191b52aafb0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -96,8 +96,9 @@ out: } static const struct vm_operations_struct f2fs_file_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = f2fs_vm_page_mkwrite, + .fault = filemap_fault, + .page_mkwrite = f2fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode) @@ -137,6 +138,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; + /* guarantee free sections for fsync */ + f2fs_balance_fs(sbi); + mutex_lock(&inode->i_mutex); if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) @@ -407,6 +411,8 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + f2fs_balance_fs(sbi); + mutex_lock_op(sbi, DATA_TRUNC); set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, RDONLY_NODE); @@ -534,7 +540,6 @@ static long f2fs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file->f_path.dentry->d_inode; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); long ret; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -545,7 +550,10 @@ static long f2fs_fallocate(struct file *file, int mode, else ret = expand_inode_data(inode, offset, len, mode); - f2fs_balance_fs(sbi); + if (!ret) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + } return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b0ec721e984a..c386910dacc5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -78,7 +78,7 @@ static int gc_thread_func(void *data) sbi->bg_gc++; - if (f2fs_gc(sbi, 1) == GC_NONE) + if (f2fs_gc(sbi) == GC_NONE) wait_ms = GC_THREAD_NOGC_SLEEP_TIME; else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) wait_ms = GC_THREAD_MAX_SLEEP_TIME; @@ -424,7 +424,11 @@ next_step: } /* - * Calculate start block index that this node page contains + * Calculate start block index indicating the given node offset. + * Be careful, caller should give this node offset only indicating direct node + * blocks. If any node offsets, which point the other types of node blocks such + * as indirect or double indirect node blocks, are given, it must be a caller's + * bug. */ block_t start_bidx_of_node(unsigned int node_ofs) { @@ -651,62 +655,44 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, return ret; } -int f2fs_gc(struct f2fs_sb_info *sbi, int nGC) +int f2fs_gc(struct f2fs_sb_info *sbi) { - unsigned int segno; - int old_free_secs, cur_free_secs; - int gc_status, nfree; struct list_head ilist; + unsigned int segno, i; int gc_type = BG_GC; + int gc_status = GC_NONE; INIT_LIST_HEAD(&ilist); gc_more: - nfree = 0; - gc_status = GC_NONE; + if (!(sbi->sb->s_flags & MS_ACTIVE)) + goto stop; if (has_not_enough_free_secs(sbi)) - old_free_secs = reserved_sections(sbi); - else - old_free_secs = free_sections(sbi); - - while (sbi->sb->s_flags & MS_ACTIVE) { - int i; - if (has_not_enough_free_secs(sbi)) - gc_type = FG_GC; + gc_type = FG_GC; - cur_free_secs = free_sections(sbi) + nfree; + if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) + goto stop; - /* We got free space successfully. */ - if (nGC < cur_free_secs - old_free_secs) - break; - - if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) + for (i = 0; i < sbi->segs_per_sec; i++) { + /* + * do_garbage_collect will give us three gc_status: + * GC_ERROR, GC_DONE, and GC_BLOCKED. + * If GC is finished uncleanly, we have to return + * the victim to dirty segment list. + */ + gc_status = do_garbage_collect(sbi, segno + i, &ilist, gc_type); + if (gc_status != GC_DONE) break; - - for (i = 0; i < sbi->segs_per_sec; i++) { - /* - * do_garbage_collect will give us three gc_status: - * GC_ERROR, GC_DONE, and GC_BLOCKED. - * If GC is finished uncleanly, we have to return - * the victim to dirty segment list. - */ - gc_status = do_garbage_collect(sbi, segno + i, - &ilist, gc_type); - if (gc_status != GC_DONE) - goto stop; - nfree++; - } } -stop: - if (has_not_enough_free_secs(sbi) || gc_status == GC_BLOCKED) { + if (has_not_enough_free_secs(sbi)) { write_checkpoint(sbi, (gc_status == GC_BLOCKED), false); - if (nfree) + if (has_not_enough_free_secs(sbi)) goto gc_more; } +stop: mutex_unlock(&sbi->gc_mutex); put_gc_inode(&ilist); - BUG_ON(!list_empty(&ilist)); return gc_status; } @@ -715,7 +701,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) DIRTY_I(sbi)->v_ops = &default_v_ops; } -int create_gc_caches(void) +int __init create_gc_caches(void) { winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", sizeof(struct inode_entry), NULL); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bf20b4d03214..794241777322 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -217,6 +217,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; + if (wbc) + f2fs_balance_fs(sbi); + node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5066bfd256c9..9bda63c9c166 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1124,6 +1124,12 @@ static int f2fs_write_node_page(struct page *page, return 0; } +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * Be default, 512 pages (2MB), a segment size, is quite reasonable. + */ +#define COLLECT_DIRTY_NODES 512 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1131,17 +1137,16 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct block_device *bdev = sbi->sb->s_bdev; long nr_to_write = wbc->nr_to_write; - if (wbc->for_kupdate) - return 0; - - if (get_pages(sbi, F2FS_DIRTY_NODES) == 0) - return 0; - + /* First check balancing cached NAT entries */ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { write_checkpoint(sbi, false, false); return 0; } + /* collect a number of dirty node pages and write together */ + if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) + return 0; + /* if mounting is failed, skip writing node pages */ wbc->nr_to_write = bio_get_nr_vecs(bdev); sync_node_pages(sbi, 0, wbc); @@ -1732,7 +1737,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kfree(nm_i); } -int create_node_manager_caches(void) +int __init create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", sizeof(struct nat_entry), NULL); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b571fee677d5..f42e4060b399 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -67,7 +67,7 @@ static int recover_dentry(struct page *ipage, struct inode *inode) kunmap(page); f2fs_put_page(page, 0); } else { - f2fs_add_link(&dent, inode); + err = f2fs_add_link(&dent, inode); } iput(dir); out: @@ -151,7 +151,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto out; } - INIT_LIST_HEAD(&entry->list); list_add_tail(&entry->list, head); entry->blkaddr = blkaddr; } @@ -174,10 +173,9 @@ out: static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - struct list_head *this; - struct fsync_inode_entry *entry; - list_for_each(this, head) { - entry = list_entry(this, struct fsync_inode_entry, list); + struct fsync_inode_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, head, list) { iput(entry->inode); list_del(&entry->list); kmem_cache_free(fsync_entry_slab, entry); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index de6240922b0a..4b0099066582 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -31,7 +31,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) */ if (has_not_enough_free_secs(sbi)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, 1); + f2fs_gc(sbi); } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 08a94c814bdc..37fad04c8669 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -53,6 +53,18 @@ static match_table_t f2fs_tokens = { {Opt_err, NULL}, }; +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + static void init_once(void *foo) { struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; @@ -125,6 +137,8 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (sync) write_checkpoint(sbi, false, false); + else + f2fs_balance_fs(sbi); return 0; } @@ -247,7 +261,8 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static int parse_options(struct f2fs_sb_info *sbi, char *options) +static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi, + char *options) { substring_t args[MAX_OPT_ARGS]; char *p; @@ -286,7 +301,8 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) break; #else case Opt_nouser_xattr: - pr_info("nouser_xattr options not supported\n"); + f2fs_msg(sb, KERN_INFO, + "nouser_xattr options not supported"); break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL @@ -295,7 +311,7 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) break; #else case Opt_noacl: - pr_info("noacl options not supported\n"); + f2fs_msg(sb, KERN_INFO, "noacl options not supported"); break; #endif case Opt_active_logs: @@ -309,8 +325,9 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) set_opt(sbi, DISABLE_EXT_IDENTIFY); break; default: - pr_err("Unrecognized mount option \"%s\" or missing value\n", - p); + f2fs_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" or missing value", + p); return -EINVAL; } } @@ -337,23 +354,36 @@ static loff_t max_file_size(unsigned bits) return result; } -static int sanity_check_raw_super(struct f2fs_super_block *raw_super) +static int sanity_check_raw_super(struct super_block *sb, + struct f2fs_super_block *raw_super) { unsigned int blocksize; - if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) + if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { + f2fs_msg(sb, KERN_INFO, + "Magic Mismatch, valid(0x%x) - read(0x%x)", + F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic)); return 1; + } /* Currently, support only 4KB block size */ blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); - if (blocksize != PAGE_CACHE_SIZE) + if (blocksize != PAGE_CACHE_SIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid blocksize (%u), supports only 4KB\n", + blocksize); return 1; + } if (le32_to_cpu(raw_super->log_sectorsize) != - F2FS_LOG_SECTOR_SIZE) + F2FS_LOG_SECTOR_SIZE) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); return 1; + } if (le32_to_cpu(raw_super->log_sectors_per_block) != - F2FS_LOG_SECTORS_PER_BLOCK) + F2FS_LOG_SECTORS_PER_BLOCK) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); return 1; + } return 0; } @@ -413,14 +443,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; - /* set a temporary block size */ - if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) + /* set a block size */ + if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { + f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; + } /* read f2fs raw super block */ raw_super_buf = sb_bread(sb, 0); if (!raw_super_buf) { err = -EIO; + f2fs_msg(sb, KERN_ERR, "unable to read superblock"); goto free_sbi; } raw_super = (struct f2fs_super_block *) @@ -438,12 +471,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi, POSIX_ACL); #endif /* parse mount options */ - if (parse_options(sbi, (char *)data)) + if (parse_options(sb, sbi, (char *)data)) goto free_sb_buf; /* sanity checking of raw super */ - if (sanity_check_raw_super(raw_super)) + if (sanity_check_raw_super(sb, raw_super)) { + f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem"); goto free_sb_buf; + } sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); sb->s_max_links = F2FS_LINK_MAX; @@ -477,18 +512,23 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); goto free_sb_buf; } err = get_valid_checkpoint(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint"); goto free_meta_inode; + } /* sanity checking of checkpoint */ err = -EINVAL; - if (sanity_check_ckpt(raw_super, sbi->ckpt)) + if (sanity_check_ckpt(raw_super, sbi->ckpt)) { + f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); goto free_cp; + } sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); @@ -502,25 +542,28 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->dir_inode_list); spin_lock_init(&sbi->dir_inode_lock); - /* init super block */ - if (!sb_set_blocksize(sb, sbi->blocksize)) - goto free_cp; - init_orphan_info(sbi); /* setup f2fs internal modules */ err = build_segment_manager(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS segment manager"); goto free_sm; + } err = build_node_manager(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS node manager"); goto free_nm; + } build_gc_manager(sbi); /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); if (IS_ERR(sbi->node_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); err = PTR_ERR(sbi->node_inode); goto free_nm; } @@ -533,6 +576,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { + f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); err = PTR_ERR(root); goto free_node_inode; } @@ -596,7 +640,7 @@ static struct file_system_type f2fs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static int init_inodecache(void) +static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), NULL); @@ -631,14 +675,17 @@ static int __init init_f2fs_fs(void) err = create_checkpoint_caches(); if (err) goto fail; - return register_filesystem(&f2fs_fs_type); + err = register_filesystem(&f2fs_fs_type); + if (err) + goto fail; + f2fs_create_root_stats(); fail: return err; } static void __exit exit_f2fs_fs(void) { - destroy_root_stats(); + f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); destroy_checkpoint_caches(); destroy_gc_caches(); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 940136a3d3a6..8038c0496504 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -318,6 +318,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, if (name_len > 255 || value_len > MAX_VALUE_LEN) return -ERANGE; + f2fs_balance_fs(sbi); + mutex_lock_op(sbi, NODE_NEW); if (!fi->i_xattr_nid) { /* Allocate new attribute block */ diff --git a/fs/file.c b/fs/file.c index 15cb8618e95d..2b3570b7caeb 100644 --- a/fs/file.c +++ b/fs/file.c @@ -490,7 +490,7 @@ void exit_files(struct task_struct *tsk) } } -static void __devinit fdtable_defer_list_init(int cpu) +static void fdtable_defer_list_init(int cpu) { struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); spin_lock_init(&fddef->lock); diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 0cf160a94eda..1b2f6c2c3aaf 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -4,12 +4,24 @@ config FUSE_FS With FUSE it is possible to implement a fully functional filesystem in a userspace program. - There's also companion library: libfuse. This library along with - utilities is available from the FUSE homepage: + There's also a companion library: libfuse2. This library is available + from the FUSE homepage: <http://fuse.sourceforge.net/> + although chances are your distribution already has that library + installed if you've installed the "fuse" package itself. See <file:Documentation/filesystems/fuse.txt> for more information. See <file:Documentation/Changes> for needed library/utility version. If you want to develop a userspace FS, or if you want to use a filesystem based on FUSE, answer Y or M. + +config CUSE + tristate "Character device in Userspace support" + depends on FUSE_FS + help + This FUSE extension allows character devices to be + implemented in userspace. + + If you want to develop or use a userspace character device + based on CUSE, answer Y or M. diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index ee8d55042298..6f96a8def147 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -45,7 +45,6 @@ #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/slab.h> -#include <linux/spinlock.h> #include <linux/stat.h> #include <linux/module.h> @@ -63,7 +62,7 @@ struct cuse_conn { bool unrestricted_ioctl; }; -static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */ +static DEFINE_MUTEX(cuse_lock); /* protects registration */ static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; static struct class *cuse_class; @@ -92,19 +91,22 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { loff_t pos = 0; + struct iovec iov = { .iov_base = buf, .iov_len = count }; - return fuse_direct_io(file, buf, count, &pos, 0); + return fuse_direct_io(file, &iov, 1, count, &pos, 0); } static ssize_t cuse_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { loff_t pos = 0; + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + /* * No locking or generic_write_checks(), the server is * responsible for locking and sanity checks. */ - return fuse_direct_io(file, buf, count, &pos, 1); + return fuse_direct_io(file, &iov, 1, count, &pos, 1); } static int cuse_open(struct inode *inode, struct file *file) @@ -114,14 +116,14 @@ static int cuse_open(struct inode *inode, struct file *file) int rc; /* look up and get the connection */ - spin_lock(&cuse_lock); + mutex_lock(&cuse_lock); list_for_each_entry(pos, cuse_conntbl_head(devt), list) if (pos->dev->devt == devt) { fuse_conn_get(&pos->fc); cc = pos; break; } - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* dead? */ if (!cc) @@ -267,7 +269,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) { char *end = p + len; - char *key, *val; + char *uninitialized_var(key), *uninitialized_var(val); int rc; while (true) { @@ -305,14 +307,14 @@ static void cuse_gendev_release(struct device *dev) */ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) { - struct cuse_conn *cc = fc_to_cc(fc); + struct cuse_conn *cc = fc_to_cc(fc), *pos; struct cuse_init_out *arg = req->out.args[0].value; struct page *page = req->pages[0]; struct cuse_devinfo devinfo = { }; struct device *dev; struct cdev *cdev; dev_t devt; - int rc; + int rc, i; if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { @@ -356,15 +358,24 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) dev_set_drvdata(dev, cc); dev_set_name(dev, "%s", devinfo.name); + mutex_lock(&cuse_lock); + + /* make sure the device-name is unique */ + for (i = 0; i < CUSE_CONNTBL_LEN; ++i) { + list_for_each_entry(pos, &cuse_conntbl[i], list) + if (!strcmp(dev_name(pos->dev), dev_name(dev))) + goto err_unlock; + } + rc = device_add(dev); if (rc) - goto err_device; + goto err_unlock; /* register cdev */ rc = -ENOMEM; cdev = cdev_alloc(); if (!cdev) - goto err_device; + goto err_unlock; cdev->owner = THIS_MODULE; cdev->ops = &cuse_frontend_fops; @@ -377,9 +388,8 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) cc->cdev = cdev; /* make the device available */ - spin_lock(&cuse_lock); list_add(&cc->list, cuse_conntbl_head(devt)); - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* announce device availability */ dev_set_uevent_suppress(dev, 0); @@ -391,7 +401,8 @@ out: err_cdev: cdev_del(cdev); -err_device: +err_unlock: + mutex_unlock(&cuse_lock); put_device(dev); err_region: unregister_chrdev_region(devt, 1); @@ -411,7 +422,7 @@ static int cuse_send_init(struct cuse_conn *cc) BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); - req = fuse_get_req(fc); + req = fuse_get_req(fc, 1); if (IS_ERR(req)) { rc = PTR_ERR(req); goto err; @@ -441,6 +452,7 @@ static int cuse_send_init(struct cuse_conn *cc) req->out.argvar = 1; req->out.argpages = 1; req->pages[0] = page; + req->page_descs[0].length = req->out.args[1].size; req->num_pages = 1; req->end = cuse_process_init_reply; fuse_request_send_background(fc, req); @@ -520,9 +532,9 @@ static int cuse_channel_release(struct inode *inode, struct file *file) int rc; /* remove from the conntbl, no more access from this point on */ - spin_lock(&cuse_lock); + mutex_lock(&cuse_lock); list_del_init(&cc->list); - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* remove device */ if (cc->dev) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c16335315e5d..e9bdec0b16d9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -34,34 +34,67 @@ static struct fuse_conn *fuse_get_conn(struct file *file) return file->private_data; } -static void fuse_request_init(struct fuse_req *req) +static void fuse_request_init(struct fuse_req *req, struct page **pages, + struct fuse_page_desc *page_descs, + unsigned npages) { memset(req, 0, sizeof(*req)); + memset(pages, 0, sizeof(*pages) * npages); + memset(page_descs, 0, sizeof(*page_descs) * npages); INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); atomic_set(&req->count, 1); + req->pages = pages; + req->page_descs = page_descs; + req->max_pages = npages; } -struct fuse_req *fuse_request_alloc(void) +static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) { - struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL); - if (req) - fuse_request_init(req); + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags); + if (req) { + struct page **pages; + struct fuse_page_desc *page_descs; + + if (npages <= FUSE_REQ_INLINE_PAGES) { + pages = req->inline_pages; + page_descs = req->inline_page_descs; + } else { + pages = kmalloc(sizeof(struct page *) * npages, flags); + page_descs = kmalloc(sizeof(struct fuse_page_desc) * + npages, flags); + } + + if (!pages || !page_descs) { + kfree(pages); + kfree(page_descs); + kmem_cache_free(fuse_req_cachep, req); + return NULL; + } + + fuse_request_init(req, pages, page_descs, npages); + } return req; } + +struct fuse_req *fuse_request_alloc(unsigned npages) +{ + return __fuse_request_alloc(npages, GFP_KERNEL); +} EXPORT_SYMBOL_GPL(fuse_request_alloc); -struct fuse_req *fuse_request_alloc_nofs(void) +struct fuse_req *fuse_request_alloc_nofs(unsigned npages) { - struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); - if (req) - fuse_request_init(req); - return req; + return __fuse_request_alloc(npages, GFP_NOFS); } void fuse_request_free(struct fuse_req *req) { + if (req->pages != req->inline_pages) { + kfree(req->pages); + kfree(req->page_descs); + } kmem_cache_free(fuse_req_cachep, req); } @@ -97,7 +130,7 @@ static void fuse_req_init_context(struct fuse_req *req) req->in.h.pid = current->pid; } -struct fuse_req *fuse_get_req(struct fuse_conn *fc) +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) { struct fuse_req *req; sigset_t oldset; @@ -116,7 +149,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) if (!fc->connected) goto out; - req = fuse_request_alloc(); + req = fuse_request_alloc(npages); err = -ENOMEM; if (!req) goto out; @@ -165,7 +198,7 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) struct fuse_file *ff = file->private_data; spin_lock(&fc->lock); - fuse_request_init(req); + fuse_request_init(req, req->pages, req->page_descs, req->max_pages); BUG_ON(ff->reserved_req); ff->reserved_req = req; wake_up_all(&fc->reserved_req_waitq); @@ -186,13 +219,14 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) * filesystem should not have it's own file open. If deadlock is * intentional, it can still be broken by "aborting" the filesystem. */ -struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file) +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, + struct file *file) { struct fuse_req *req; atomic_inc(&fc->num_waiting); wait_event(fc->blocked_waitq, !fc->blocked); - req = fuse_request_alloc(); + req = fuse_request_alloc(0); if (!req) req = get_reserved_req(fc, file); @@ -406,9 +440,8 @@ __acquires(fc->lock) } } -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { - req->isreply = 1; spin_lock(&fc->lock); if (!fc->connected) req->out.h.error = -ENOTCONN; @@ -425,6 +458,12 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) } spin_unlock(&fc->lock); } + +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + __fuse_request_send(fc, req); +} EXPORT_SYMBOL_GPL(fuse_request_send); static void fuse_request_send_nowait_locked(struct fuse_conn *fc, @@ -491,6 +530,27 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, fuse_request_send_nowait_locked(fc, req); } +void fuse_force_forget(struct file *file, u64 nodeid) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_forget_in inarg; + + memset(&inarg, 0, sizeof(inarg)); + inarg.nlookup = 1; + req = fuse_get_req_nofail_nopages(fc, file); + req->in.h.opcode = FUSE_FORGET; + req->in.h.nodeid = nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->isreply = 0; + __fuse_request_send(fc, req); + /* ignore errors */ + fuse_put_request(fc, req); +} + /* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already @@ -692,8 +752,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) struct page *oldpage = *pagep; struct page *newpage; struct pipe_buffer *buf = cs->pipebufs; - struct address_space *mapping; - pgoff_t index; unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); @@ -724,9 +782,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (fuse_check_page(newpage) != 0) goto out_fallback_unlock; - mapping = oldpage->mapping; - index = oldpage->index; - /* * This is a new and locked page, it shouldn't be mapped or * have any special flags on it @@ -855,11 +910,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, { unsigned i; struct fuse_req *req = cs->req; - unsigned offset = req->page_offset; - unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { int err; + unsigned offset = req->page_descs[i].offset; + unsigned count = min(nbytes, req->page_descs[i].length); err = fuse_copy_page(cs, &req->pages[i], offset, count, zeroing); @@ -867,8 +922,6 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, return err; nbytes -= count; - count = min(nbytes, (unsigned) PAGE_SIZE); - offset = 0; } return 0; } @@ -1541,29 +1594,34 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; + int num_pages; + + offset = outarg->offset & ~PAGE_CACHE_MASK; + file_size = i_size_read(inode); + + num = outarg->size; + if (outarg->offset > file_size) + num = 0; + else if (outarg->offset + num > file_size) + num = file_size - outarg->offset; - req = fuse_get_req(fc); + num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ); + + req = fuse_get_req(fc, num_pages); if (IS_ERR(req)) return PTR_ERR(req); - offset = outarg->offset & ~PAGE_CACHE_MASK; - req->in.h.opcode = FUSE_NOTIFY_REPLY; req->in.h.nodeid = outarg->nodeid; req->in.numargs = 2; req->in.argpages = 1; - req->page_offset = offset; + req->page_descs[0].offset = offset; req->end = fuse_retrieve_end; index = outarg->offset >> PAGE_CACHE_SHIFT; - file_size = i_size_read(inode); - num = outarg->size; - if (outarg->offset > file_size) - num = 0; - else if (outarg->offset + num > file_size) - num = file_size - outarg->offset; - while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) { + while (num && req->num_pages < num_pages) { struct page *page; unsigned int this_num; @@ -1573,6 +1631,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = this_num; req->num_pages++; offset = 0; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b7c09f9eb40c..85065221a58a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,6 +14,29 @@ #include <linux/namei.h> #include <linux/slab.h> +static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_inode *fi = get_fuse_inode(dir); + + if (!fc->do_readdirplus) + return false; + if (!fc->readdirplus_auto) + return true; + if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) + return true; + if (filp->f_pos == 0) + return true; + return false; +} + +static void fuse_advise_use_readdirplus(struct inode *dir) +{ + struct fuse_inode *fi = get_fuse_inode(dir); + + set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); +} + #if BITS_PER_LONG >= 64 static inline void fuse_dentry_settime(struct dentry *entry, u64 time) { @@ -178,7 +201,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) return -ECHILD; fc = get_fuse_conn(inode); - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return 0; @@ -219,6 +242,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) attr_version); fuse_change_entry_timeout(entry, &outarg); } + fuse_advise_use_readdirplus(inode); return 1; } @@ -271,7 +295,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, if (name->len > FUSE_NAME_MAX) goto out; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); err = PTR_ERR(req); if (IS_ERR(req)) goto out; @@ -355,6 +379,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, else fuse_invalidate_entry_cache(entry); + fuse_advise_use_readdirplus(dir); return newent; out_iput: @@ -391,7 +416,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (!forget) goto out_err; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); err = PTR_ERR(req); if (IS_ERR(req)) goto out_put_forget_req; @@ -592,7 +617,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, { struct fuse_mknod_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -623,7 +648,7 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -647,7 +672,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -664,7 +689,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -682,7 +707,14 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) spin_lock(&fc->lock); fi->attr_version = ++fc->attr_version; - drop_nlink(inode); + /* + * If i_nlink == 0 then unlink doesn't make sense, yet this can + * happen if userspace filesystem is careless. It would be + * difficult to enforce correct nlink usage so just ignore this + * condition here + */ + if (inode->i_nlink > 0) + drop_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); @@ -696,7 +728,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -723,7 +755,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, int err; struct fuse_rename_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -776,7 +808,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_link_in inarg; struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -848,7 +880,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, struct fuse_req *req; u64 attr_version; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -985,7 +1017,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, /* * Calling into a user-controlled filesystem gives the filesystem - * daemon ptrace-like capabilities over the requester process. This + * daemon ptrace-like capabilities over the current process. This * means, that the filesystem daemon is able to record the exact * filesystem operations performed, and can also control the behavior * of the requester process in otherwise impossible ways. For example @@ -996,27 +1028,23 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, * for which the owner of the mount has ptrace privilege. This * excludes processes started by other users, suid or sgid processes. */ -int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) +int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; - int ret; if (fc->flags & FUSE_ALLOW_OTHER) return 1; - rcu_read_lock(); - ret = 0; - cred = __task_cred(task); + cred = current_cred(); if (uid_eq(cred->euid, fc->user_id) && uid_eq(cred->suid, fc->user_id) && uid_eq(cred->uid, fc->user_id) && gid_eq(cred->egid, fc->group_id) && gid_eq(cred->sgid, fc->group_id) && gid_eq(cred->gid, fc->group_id)) - ret = 1; - rcu_read_unlock(); + return 1; - return ret; + return 0; } static int fuse_access(struct inode *inode, int mask) @@ -1029,7 +1057,7 @@ static int fuse_access(struct inode *inode, int mask) if (fc->no_access) return 0; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1077,7 +1105,7 @@ static int fuse_permission(struct inode *inode, int mask) bool refreshed = false; int err = 0; - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; /* @@ -1155,19 +1183,157 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file, return 0; } -static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +static int fuse_direntplus_link(struct file *file, + struct fuse_direntplus *direntplus, + u64 attr_version) { int err; + struct fuse_entry_out *o = &direntplus->entry_out; + struct fuse_dirent *dirent = &direntplus->dirent; + struct dentry *parent = file->f_path.dentry; + struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = parent->d_inode; + struct fuse_conn *fc; + struct inode *inode; + + if (!o->nodeid) { + /* + * Unlike in the case of fuse_lookup, zero nodeid does not mean + * ENOENT. Instead, it only means the userspace filesystem did + * not want to return attributes/handle for this entry. + * + * So do nothing. + */ + return 0; + } + + if (name.name[0] == '.') { + /* + * We could potentially refresh the attributes of the directory + * and its parent? + */ + if (name.len == 1) + return 0; + if (name.name[1] == '.' && name.len == 2) + return 0; + } + fc = get_fuse_conn(dir); + + name.hash = full_name_hash(name.name, name.len); + dentry = d_lookup(parent, &name); + if (dentry && dentry->d_inode) { + inode = dentry->d_inode; + if (get_node_id(inode) == o->nodeid) { + struct fuse_inode *fi; + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + + /* + * The other branch to 'found' comes via fuse_iget() + * which bumps nlookup inside + */ + goto found; + } + err = d_invalidate(dentry); + if (err) + goto out; + dput(dentry); + dentry = NULL; + } + + dentry = d_alloc(parent, &name); + err = -ENOMEM; + if (!dentry) + goto out; + + inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, + &o->attr, entry_attr_timeout(o), attr_version); + if (!inode) + goto out; + + alias = d_materialise_unique(dentry, inode); + err = PTR_ERR(alias); + if (IS_ERR(alias)) + goto out; + if (alias) { + dput(dentry); + dentry = alias; + } + +found: + fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), + attr_version); + + fuse_change_entry_timeout(dentry, o); + + err = 0; +out: + if (dentry) + dput(dentry); + return err; +} + +static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, + void *dstbuf, filldir_t filldir, u64 attr_version) +{ + struct fuse_direntplus *direntplus; + struct fuse_dirent *dirent; + size_t reclen; + int over = 0; + int ret; + + while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { + direntplus = (struct fuse_direntplus *) buf; + dirent = &direntplus->dirent; + reclen = FUSE_DIRENTPLUS_SIZE(direntplus); + + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + + if (!over) { + /* We fill entries into dstbuf only as much as + it can hold. But we still continue iterating + over remaining entries to link them. If not, + we need to send a FORGET for each of those + which we did not link. + */ + over = filldir(dstbuf, dirent->name, dirent->namelen, + file->f_pos, dirent->ino, + dirent->type); + file->f_pos = dirent->off; + } + + buf += reclen; + nbytes -= reclen; + + ret = fuse_direntplus_link(file, direntplus, attr_version); + if (ret) + fuse_force_forget(file, direntplus->entry_out.nodeid); + } + + return 0; +} + +static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +{ + int plus, err; size_t nbytes; struct page *page; struct inode *inode = file->f_path.dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; + u64 attr_version = 0; if (is_bad_inode(inode)) return -EIO; - req = fuse_get_req(fc); + req = fuse_get_req(fc, 1); if (IS_ERR(req)) return PTR_ERR(req); @@ -1176,17 +1342,34 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) fuse_put_request(fc, req); return -ENOMEM; } + + plus = fuse_use_readdirplus(inode, file); req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; - fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR); + req->page_descs[0].length = PAGE_SIZE; + if (plus) { + attr_version = fuse_get_attr_version(fc); + fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + FUSE_READDIRPLUS); + } else { + fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + FUSE_READDIR); + } fuse_request_send(fc, req); nbytes = req->out.args[0].size; err = req->out.h.error; fuse_put_request(fc, req); - if (!err) - err = parse_dirfile(page_address(page), nbytes, file, dstbuf, - filldir); + if (!err) { + if (plus) { + err = parse_dirplusfile(page_address(page), nbytes, + file, dstbuf, filldir, + attr_version); + } else { + err = parse_dirfile(page_address(page), nbytes, file, + dstbuf, filldir); + } + } __free_page(page); fuse_invalidate_attr(inode); /* atime changed */ @@ -1197,7 +1380,7 @@ static char *read_link(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); char *link; if (IS_ERR(req)) @@ -1391,7 +1574,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, loff_t oldsize; int err; - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) @@ -1410,7 +1593,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, if (attr->ia_valid & ATTR_SIZE) is_truncate = true; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1500,7 +1683,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; return fuse_update_attributes(inode, stat, NULL, NULL); @@ -1518,7 +1701,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name, if (fc->no_setxattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1557,7 +1740,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, if (fc->no_getxattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1603,13 +1786,13 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) struct fuse_getxattr_out outarg; ssize_t ret; - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; if (fc->no_listxattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1654,7 +1837,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e21d4d8f87e3..c8071768b950 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -25,7 +25,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, struct fuse_req *req; int err; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -57,7 +57,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) return NULL; ff->fc = fc; - ff->reserved_req = fuse_request_alloc(); + ff->reserved_req = fuse_request_alloc(0); if (unlikely(!ff->reserved_req)) { kfree(ff); return NULL; @@ -368,7 +368,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fc->no_flush) return 0; - req = fuse_get_req_nofail(fc, file); + req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fc, id); @@ -436,7 +436,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, fuse_sync_writes(inode); - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; @@ -544,7 +544,7 @@ static int fuse_readpage(struct file *file, struct page *page) */ fuse_wait_on_page_writeback(inode, page->index); - req = fuse_get_req(fc); + req = fuse_get_req(fc, 1); err = PTR_ERR(req); if (IS_ERR(req)) goto out; @@ -555,6 +555,7 @@ static int fuse_readpage(struct file *file, struct page *page) req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; + req->page_descs[0].length = count; num_read = fuse_send_read(req, file, pos, count, NULL); err = req->out.h.error; fuse_put_request(fc, req); @@ -641,6 +642,7 @@ struct fuse_fill_data { struct fuse_req *req; struct file *file; struct inode *inode; + unsigned nr_pages; }; static int fuse_readpages_fill(void *_data, struct page *page) @@ -656,16 +658,26 @@ static int fuse_readpages_fill(void *_data, struct page *page) (req->num_pages == FUSE_MAX_PAGES_PER_REQ || (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { + int nr_alloc = min_t(unsigned, data->nr_pages, + FUSE_MAX_PAGES_PER_REQ); fuse_send_readpages(req, data->file); - data->req = req = fuse_get_req(fc); + data->req = req = fuse_get_req(fc, nr_alloc); if (IS_ERR(req)) { unlock_page(page); return PTR_ERR(req); } } + + if (WARN_ON(req->num_pages >= req->max_pages)) { + fuse_put_request(fc, req); + return -EIO; + } + page_cache_get(page); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = PAGE_SIZE; req->num_pages++; + data->nr_pages--; return 0; } @@ -676,6 +688,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_data data; int err; + int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ); err = -EIO; if (is_bad_inode(inode)) @@ -683,7 +696,8 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - data.req = fuse_get_req(fc); + data.req = fuse_get_req(fc, nr_alloc); + data.nr_pages = nr_pages; err = PTR_ERR(data.req); if (IS_ERR(data.req)) goto out; @@ -786,7 +800,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, res = fuse_send_write(req, file, pos, count, NULL); - offset = req->page_offset; + offset = req->page_descs[0].offset; count = res; for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; @@ -817,7 +831,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, int err; req->in.argpages = 1; - req->page_offset = offset; + req->page_descs[0].offset = offset; do { size_t tmp; @@ -857,6 +871,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, err = 0; req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = tmp; req->num_pages++; iov_iter_advance(ii, tmp); @@ -869,11 +884,19 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && - req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); + req->num_pages < req->max_pages && offset == 0); return count > 0 ? count : err; } +static inline unsigned fuse_wr_pages(loff_t pos, size_t len) +{ + return min_t(unsigned, + ((pos + len - 1) >> PAGE_CACHE_SHIFT) - + (pos >> PAGE_CACHE_SHIFT) + 1, + FUSE_MAX_PAGES_PER_REQ); +} + static ssize_t fuse_perform_write(struct file *file, struct address_space *mapping, struct iov_iter *ii, loff_t pos) @@ -889,8 +912,9 @@ static ssize_t fuse_perform_write(struct file *file, do { struct fuse_req *req; ssize_t count; + unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii)); - req = fuse_get_req(fc); + req = fuse_get_req(fc, nr_pages); if (IS_ERR(req)) { err = PTR_ERR(req); break; @@ -1023,47 +1047,110 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) } } -static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, +static inline void fuse_page_descs_length_init(struct fuse_req *req, + unsigned index, unsigned nr_pages) +{ + int i; + + for (i = index; i < index + nr_pages; i++) + req->page_descs[i].length = PAGE_SIZE - + req->page_descs[i].offset; +} + +static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) +{ + return (unsigned long)ii->iov->iov_base + ii->iov_offset; +} + +static inline size_t fuse_get_frag_size(const struct iov_iter *ii, + size_t max_size) +{ + return min(iov_iter_single_seg_count(ii), max_size); +} + +static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, size_t *nbytesp, int write) { - size_t nbytes = *nbytesp; - unsigned long user_addr = (unsigned long) buf; - unsigned offset = user_addr & ~PAGE_MASK; - int npages; + size_t nbytes = 0; /* # bytes already packed in req */ /* Special case for kernel I/O: can copy directly into the buffer */ if (segment_eq(get_fs(), KERNEL_DS)) { + unsigned long user_addr = fuse_get_user_addr(ii); + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + if (write) req->in.args[1].value = (void *) user_addr; else req->out.args[0].value = (void *) user_addr; + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; return 0; } - nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); - npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); - npages = get_user_pages_fast(user_addr, npages, !write, req->pages); - if (npages < 0) - return npages; + while (nbytes < *nbytesp && req->num_pages < req->max_pages) { + unsigned npages; + unsigned long user_addr = fuse_get_user_addr(ii); + unsigned offset = user_addr & ~PAGE_MASK; + size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes); + int ret; + + unsigned n = req->max_pages - req->num_pages; + frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT); + + npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + npages = clamp(npages, 1U, n); + + ret = get_user_pages_fast(user_addr, npages, !write, + &req->pages[req->num_pages]); + if (ret < 0) + return ret; - req->num_pages = npages; - req->page_offset = offset; + npages = ret; + frag_size = min_t(size_t, frag_size, + (npages << PAGE_SHIFT) - offset); + iov_iter_advance(ii, frag_size); + + req->page_descs[req->num_pages].offset = offset; + fuse_page_descs_length_init(req, req->num_pages, npages); + + req->num_pages += npages; + req->page_descs[req->num_pages - 1].length -= + (npages << PAGE_SHIFT) - offset - frag_size; + + nbytes += frag_size; + } if (write) req->in.argpages = 1; else req->out.argpages = 1; - nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; - *nbytesp = min(*nbytesp, nbytes); + *nbytesp = nbytes; return 0; } -ssize_t fuse_direct_io(struct file *file, const char __user *buf, - size_t count, loff_t *ppos, int write) +static inline int fuse_iter_npages(const struct iov_iter *ii_p) +{ + struct iov_iter ii = *ii_p; + int npages = 0; + + while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) { + unsigned long user_addr = fuse_get_user_addr(&ii); + unsigned offset = user_addr & ~PAGE_MASK; + size_t frag_size = iov_iter_single_seg_count(&ii); + + npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + iov_iter_advance(&ii, frag_size); + } + + return min(npages, FUSE_MAX_PAGES_PER_REQ); +} + +ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, + unsigned long nr_segs, size_t count, loff_t *ppos, + int write) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -1071,8 +1158,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, loff_t pos = *ppos; ssize_t res = 0; struct fuse_req *req; + struct iov_iter ii; + + iov_iter_init(&ii, iov, nr_segs, count, 0); - req = fuse_get_req(fc); + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1080,7 +1170,7 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, size_t nres; fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); - int err = fuse_get_user_pages(req, buf, &nbytes, write); + int err = fuse_get_user_pages(req, &ii, &nbytes, write); if (err) { res = err; break; @@ -1103,12 +1193,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, count -= nres; res += nres; pos += nres; - buf += nres; if (nres != nbytes) break; if (count) { fuse_put_request(fc, req); - req = fuse_get_req(fc); + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) break; } @@ -1122,8 +1211,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, } EXPORT_SYMBOL_GPL(fuse_direct_io); -static ssize_t fuse_direct_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { ssize_t res; struct inode *inode = file->f_path.dentry->d_inode; @@ -1131,22 +1220,31 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf, if (is_bad_inode(inode)) return -EIO; - res = fuse_direct_io(file, buf, count, ppos, 0); + res = fuse_direct_io(file, iov, nr_segs, iov_length(iov, nr_segs), + ppos, 0); fuse_invalidate_attr(inode); return res; } -static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t fuse_direct_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct iovec iov = { .iov_base = buf, .iov_len = count }; + return __fuse_direct_read(file, &iov, 1, ppos); +} + +static ssize_t __fuse_direct_write(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { struct inode *inode = file->f_path.dentry->d_inode; + size_t count = iov_length(iov, nr_segs); ssize_t res; res = generic_write_checks(file, ppos, &count, 0); if (!res) { - res = fuse_direct_io(file, buf, count, ppos, 1); + res = fuse_direct_io(file, iov, nr_segs, count, ppos, 1); if (res > 0) fuse_write_update_size(inode, *ppos); } @@ -1159,6 +1257,7 @@ static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, static ssize_t fuse_direct_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; struct inode *inode = file->f_path.dentry->d_inode; ssize_t res; @@ -1167,7 +1266,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, /* Don't allow parallel writes to the same file */ mutex_lock(&inode->i_mutex); - res = __fuse_direct_write(file, buf, count, ppos); + res = __fuse_direct_write(file, &iov, 1, ppos); mutex_unlock(&inode->i_mutex); return res; @@ -1272,7 +1371,7 @@ static int fuse_writepage_locked(struct page *page) set_page_writeback(page); - req = fuse_request_alloc_nofs(); + req = fuse_request_alloc_nofs(1); if (!req) goto err; @@ -1293,7 +1392,8 @@ static int fuse_writepage_locked(struct page *page) req->in.argpages = 1; req->num_pages = 1; req->pages[0] = tmp_page; - req->page_offset = 0; + req->page_descs[0].offset = 0; + req->page_descs[0].length = PAGE_SIZE; req->end = fuse_writepage_end; req->inode = inode; @@ -1471,7 +1571,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) struct fuse_lk_out outarg; int err; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1506,7 +1606,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) if (fl->fl_flags & FL_CLOSE) return 0; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1575,7 +1675,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) if (!inode->i_sb->s_bdev || fc->no_bmap) return 0; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return 0; @@ -1873,7 +1973,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, num_pages++; } - req = fuse_get_req(fc); + req = fuse_get_req(fc, num_pages); if (IS_ERR(req)) { err = PTR_ERR(req); req = NULL; @@ -1881,6 +1981,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, } memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); req->num_pages = num_pages; + fuse_page_descs_length_init(req, 0, req->num_pages); /* okay, let's send it to the client */ req->in.h.opcode = FUSE_IOCTL; @@ -1981,7 +2082,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, struct inode *inode = file->f_dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; if (is_bad_inode(inode)) @@ -2066,6 +2167,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) return DEFAULT_POLLMASK; poll_wait(file, &ff->poll_wait, wait); + inarg.events = (__u32)poll_requested_events(wait); /* * Ask for notification iff there's someone waiting for it. @@ -2076,7 +2178,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) fuse_register_polled_file(fc, ff); } - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return POLLERR; @@ -2126,41 +2228,6 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc, return 0; } -static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos, int rw) -{ - const struct iovec *vector = iov; - ssize_t ret = 0; - - while (nr_segs > 0) { - void __user *base; - size_t len; - ssize_t nr; - - base = vector->iov_base; - len = vector->iov_len; - vector++; - nr_segs--; - - if (rw == WRITE) - nr = __fuse_direct_write(filp, base, len, ppos); - else - nr = fuse_direct_read(filp, base, len, ppos); - - if (nr < 0) { - if (!ret) - ret = nr; - break; - } - ret += nr; - if (nr != len) - break; - } - - return ret; -} - - static ssize_t fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -2172,13 +2239,16 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, file = iocb->ki_filp; pos = offset; - ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw); + if (rw == WRITE) + ret = __fuse_direct_write(file, iov, nr_segs, &pos); + else + ret = __fuse_direct_read(file, iov, nr_segs, &pos); return ret; } -long fuse_file_fallocate(struct file *file, int mode, loff_t offset, - loff_t length) +static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + loff_t length) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -2194,7 +2264,7 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (fc->no_fallocate) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -2213,7 +2283,6 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, return err; } -EXPORT_SYMBOL_GPL(fuse_file_fallocate); static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e105a53fc72d..6aeba864f070 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -44,6 +44,9 @@ doing the mount will be allowed to access the filesystem */ #define FUSE_ALLOW_OTHER (1 << 1) +/** Number of page pointers embedded in fuse_req */ +#define FUSE_REQ_INLINE_PAGES 1 + /** List of active connections */ extern struct list_head fuse_conn_list; @@ -103,6 +106,15 @@ struct fuse_inode { /** List of writepage requestst (pending or sent) */ struct list_head writepages; + + /** Miscellaneous bits describing inode state */ + unsigned long state; +}; + +/** FUSE inode state bits */ +enum { + /** Advise readdirplus */ + FUSE_I_ADVISE_RDPLUS, }; struct fuse_conn; @@ -200,6 +212,12 @@ struct fuse_out { struct fuse_arg args[3]; }; +/** FUSE page descriptor */ +struct fuse_page_desc { + unsigned int length; + unsigned int offset; +}; + /** The request state */ enum fuse_req_state { FUSE_REQ_INIT = 0, @@ -291,14 +309,23 @@ struct fuse_req { } misc; /** page vector */ - struct page *pages[FUSE_MAX_PAGES_PER_REQ]; + struct page **pages; + + /** page-descriptor vector */ + struct fuse_page_desc *page_descs; + + /** size of the 'pages' array */ + unsigned max_pages; + + /** inline page vector */ + struct page *inline_pages[FUSE_REQ_INLINE_PAGES]; + + /** inline page-descriptor vector */ + struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES]; /** number of pages in vector */ unsigned num_pages; - /** offset of data on first page */ - unsigned page_offset; - /** File used in the request (or NULL) */ struct fuse_file *ff; @@ -487,6 +514,12 @@ struct fuse_conn { /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; + /** Does the filesystem support readdirplus? */ + unsigned do_readdirplus:1; + + /** Does the filesystem want adaptive readdirplus? */ + unsigned readdirplus_auto:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -578,6 +611,9 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); +/* Used by READDIRPLUS */ +void fuse_force_forget(struct file *file, u64 nodeid); + /** * Initialize READ or READDIR request */ @@ -658,9 +694,9 @@ void fuse_ctl_cleanup(void); /** * Allocate a request */ -struct fuse_req *fuse_request_alloc(void); +struct fuse_req *fuse_request_alloc(unsigned npages); -struct fuse_req *fuse_request_alloc_nofs(void); +struct fuse_req *fuse_request_alloc_nofs(unsigned npages); /** * Free a request @@ -668,14 +704,25 @@ struct fuse_req *fuse_request_alloc_nofs(void); void fuse_request_free(struct fuse_req *req); /** - * Get a request, may fail with -ENOMEM + * Get a request, may fail with -ENOMEM, + * caller should specify # elements in req->pages[] explicitly */ -struct fuse_req *fuse_get_req(struct fuse_conn *fc); +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); + +/** + * Get a request, may fail with -ENOMEM, + * useful for callers who doesn't use req->pages[] + */ +static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc) +{ + return fuse_get_req(fc, 0); +} /** * Gets a requests for a file operation, always succeeds */ -struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file); +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, + struct file *file); /** * Decrement reference count of a request. If count goes to zero free @@ -739,9 +786,9 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc); int fuse_valid_type(int m); /** - * Is task allowed to perform filesystem operation? + * Is current process allowed to perform filesystem operation? */ -int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task); +int fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); @@ -776,8 +823,9 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir); -ssize_t fuse_direct_io(struct file *file, const char __user *buf, - size_t count, loff_t *ppos, int write); +ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, + unsigned long nr_segs, size_t count, loff_t *ppos, + int write); long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags); long fuse_ioctl_common(struct file *file, unsigned int cmd, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 73ca6b72beaf..01353ed75750 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -92,6 +92,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->attr_version = 0; fi->writectr = 0; fi->orig_ino = 0; + fi->state = 0; INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); INIT_LIST_HEAD(&fi->writepages); @@ -408,12 +409,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) struct fuse_statfs_out outarg; int err; - if (!fuse_allow_task(fc, current)) { + if (!fuse_allow_current_process(fc)) { buf->f_type = FUSE_SUPER_MAGIC; return 0; } - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -863,6 +864,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->dont_mask = 1; if (arg->flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; + if (arg->flags & FUSE_DO_READDIRPLUS) + fc->do_readdirplus = 1; + if (arg->flags & FUSE_READDIRPLUS_AUTO) + fc->readdirplus_auto = 1; } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -889,7 +894,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | - FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA; + FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | + FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1034,12 +1040,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) /* only now - we want root dentry with NULL ->d_op */ sb->s_d_op = &fuse_dentry_operations; - init_req = fuse_request_alloc(); + init_req = fuse_request_alloc(0); if (!init_req) goto err_put_root; if (is_bdev) { - fc->destroy_req = fuse_request_alloc(); + fc->destroy_req = fuse_request_alloc(0); if (!fc->destroy_req) goto err_free_init_req; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 30de4f2a2ea9..24f414f0ce61 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -51,7 +51,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, continue; if (gfs2_is_jdata(ip)) set_buffer_uptodate(bh); - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_data(ip->i_gl, bh); } } @@ -230,16 +230,14 @@ out_ignore: } /** - * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk + * gfs2_writepages - Write a bunch of dirty pages back to disk * @mapping: The mapping to write * @wbc: Write-back control * - * For the data=writeback case we can already ignore buffer heads - * and write whole extents at once. This is a big reduction in the - * number of I/O requests we send and the bmap calls we make in this case. + * Used for both ordered and writeback modes. */ -static int gfs2_writeback_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int gfs2_writepages(struct address_space *mapping, + struct writeback_control *wbc) { return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); } @@ -852,7 +850,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, goto failed; } - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); @@ -1102,7 +1100,7 @@ cannot_release: static const struct address_space_operations gfs2_writeback_aops = { .writepage = gfs2_writeback_writepage, - .writepages = gfs2_writeback_writepages, + .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, .write_begin = gfs2_write_begin, @@ -1118,6 +1116,7 @@ static const struct address_space_operations gfs2_writeback_aops = { static const struct address_space_operations gfs2_ordered_aops = { .writepage = gfs2_ordered_writepage, + .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, .write_begin = gfs2_write_begin, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index a68e91bcef3d..df686d13a7d2 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -22,6 +22,7 @@ #include "meta_io.h" #include "quota.h" #include "rgrp.h" +#include "log.h" #include "super.h" #include "trans.h" #include "dir.h" @@ -93,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, if (!gfs2_is_jdata(ip)) mark_buffer_dirty(bh); if (!gfs2_is_writeback(ip)) - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_data(ip->i_gl, bh); if (release) { unlock_page(page); @@ -153,7 +154,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) /* Set up the pointer to the new block */ - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); di = (struct gfs2_dinode *)dibh->b_data; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); @@ -405,7 +406,7 @@ static inline __be64 *gfs2_indirect_init(struct metapath *mp, BUG_ON(i < 1); BUG_ON(mp->mp_bh[i] != NULL); mp->mp_bh[i] = gfs2_meta_new(gl, bn); - gfs2_trans_add_bh(gl, mp->mp_bh[i], 1); + gfs2_trans_add_meta(gl, mp->mp_bh[i]); gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); ptr += offset; @@ -468,7 +469,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, BUG_ON(sheight < 1); BUG_ON(dibh == NULL); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); if (height == sheight) { struct buffer_head *bh; @@ -544,7 +545,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, /* Branching from existing tree */ case ALLOC_GROW_DEPTH: if (i > 1 && i < height) - gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1); + gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]); for (; i < height && n > 0; i++, n--) gfs2_indirect_init(mp, ip->i_gl, i, mp->mp_list[i-1], bn++); @@ -556,7 +557,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, case ALLOC_DATA: BUG_ON(n > dblks); BUG_ON(mp->mp_bh[end_of_metadata] == NULL); - gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1); + gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]); dblks = n; ptr = metapointer(end_of_metadata, mp); dblock = bn; @@ -796,8 +797,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, down_write(&ip->i_rw_mutex); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_trans_add_meta(ip->i_gl, bh); bstart = 0; blen = 0; @@ -981,7 +982,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) } if (!gfs2_is_writeback(ip)) - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_data(ip->i_gl, bh); zero_user(page, offset, length); mark_buffer_dirty(bh); @@ -1046,7 +1047,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) if (error) goto out; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) { gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); @@ -1137,11 +1138,12 @@ static int trunc_end(struct gfs2_inode *ip) ip->i_height = 0; ip->i_goal = ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + gfs2_ordered_del_inode(ip); } ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1246,7 +1248,7 @@ static int do_grow(struct inode *inode, u64 size) i_size_write(inode, size); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1286,6 +1288,10 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) inode_dio_wait(inode); + ret = gfs2_rs_alloc(GFS2_I(inode)); + if (ret) + return ret; + oldsize = inode->i_size; if (newsize >= oldsize) return do_grow(inode, newsize); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 9a35670fdc38..7179478e5a28 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -93,7 +93,7 @@ int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head *bh; bh = gfs2_meta_new(ip->i_gl, block); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD); gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); *bhp = bh; @@ -127,7 +127,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, if (error) return error; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); if (ip->i_inode.i_size < offset + size) i_size_write(&ip->i_inode, offset + size); @@ -209,7 +209,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, if (error) goto fail; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); memcpy(bh->b_data + o, buf, amount); brelse(bh); @@ -231,7 +231,7 @@ out: i_size_write(&ip->i_inode, offset + copied); ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -647,7 +647,7 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh, return; } - gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_trans_add_meta(dip->i_gl, bh); /* If there is no prev entry, this is the first entry in the block. The de_rec_len is already as big as it needs to be. Just zero @@ -690,7 +690,7 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode, offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); totlen = be16_to_cpu(dent->de_rec_len); BUG_ON(offset + name->len > totlen); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); ndent = (struct gfs2_dirent *)((char *)dent + offset); dent->de_rec_len = cpu_to_be16(offset); gfs2_qstr2dirent(name, totlen - offset, ndent); @@ -831,7 +831,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, return NULL; gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); leaf = (struct gfs2_leaf *)bh->b_data; leaf->lf_depth = cpu_to_be16(depth); @@ -916,7 +916,7 @@ static int dir_make_exhash(struct inode *inode) /* We're done with the new leaf block, now setup the new hash table. */ - gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_trans_add_meta(dip->i_gl, dibh); gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); @@ -976,7 +976,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) return 1; /* can't split */ } - gfs2_trans_add_bh(dip->i_gl, obh, 1); + gfs2_trans_add_meta(dip->i_gl, obh); nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1); if (!nleaf) { @@ -1069,7 +1069,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) error = gfs2_meta_inode_buffer(dip, &dibh); if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { - gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_trans_add_meta(dip->i_gl, dibh); gfs2_add_inode_blocks(&dip->i_inode, 1); gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); @@ -1622,7 +1622,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) return error; } while(1); - gfs2_trans_add_bh(ip->i_gl, obh, 1); + gfs2_trans_add_meta(ip->i_gl, obh); leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth)); if (!leaf) { @@ -1636,7 +1636,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) error = gfs2_meta_inode_buffer(ip, &bh); if (error) return error; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_add_inode_blocks(&ip->i_inode, 1); gfs2_dinode_out(ip, bh->b_data); brelse(bh); @@ -1795,7 +1795,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, if (IS_ERR(dent)) return PTR_ERR(dent); - gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_trans_add_meta(dip->i_gl, bh); gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(new_type); @@ -1804,7 +1804,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, error = gfs2_meta_inode_buffer(dip, &bh); if (error) return error; - gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_trans_add_meta(dip->i_gl, bh); } dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; @@ -1917,7 +1917,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (error) goto out_end_trans; - gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_trans_add_meta(dip->i_gl, dibh); /* On the last dealloc, make this a regular file in case we crash. (We don't want to free these blocks a second time.) */ if (last_dealloc) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 991ab2d484dd..06b7092a3f25 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -276,7 +276,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) error = gfs2_meta_inode_buffer(ip, &bh); if (error) goto out_trans_end; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); ip->i_diskflags = new_flags; gfs2_dinode_out(ip, bh->b_data); brelse(bh); @@ -709,7 +709,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, if (unlikely(error)) return error; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) { error = gfs2_unstuff_dinode(ip, NULL); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 992c5c0cb504..cf3515546739 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -30,6 +30,7 @@ #include <linux/rculist_bl.h> #include <linux/bit_spinlock.h> #include <linux/percpu.h> +#include <linux/list_sort.h> #include "gfs2.h" #include "incore.h" @@ -1376,56 +1377,105 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) gfs2_glock_put(gl); } +static int glock_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct gfs2_glock *gla, *glb; -static int gfs2_shrink_glock_memory(struct shrinker *shrink, - struct shrink_control *sc) + gla = list_entry(a, struct gfs2_glock, gl_lru); + glb = list_entry(b, struct gfs2_glock, gl_lru); + + if (gla->gl_name.ln_number > glb->gl_name.ln_number) + return 1; + if (gla->gl_name.ln_number < glb->gl_name.ln_number) + return -1; + + return 0; +} + +/** + * gfs2_dispose_glock_lru - Demote a list of glocks + * @list: The list to dispose of + * + * Disposing of glocks may involve disk accesses, so that here we sort + * the glocks by number (i.e. disk location of the inodes) so that if + * there are any such accesses, they'll be sent in order (mostly). + * + * Must be called under the lru_lock, but may drop and retake this + * lock. While the lru_lock is dropped, entries may vanish from the + * list, but no new entries will appear on the list (since it is + * private) + */ + +static void gfs2_dispose_glock_lru(struct list_head *list) +__releases(&lru_lock) +__acquires(&lru_lock) { struct gfs2_glock *gl; - int may_demote; - int nr_skipped = 0; - int nr = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - LIST_HEAD(skipped); - if (nr == 0) - goto out; + list_sort(NULL, list, glock_cmp); - if (!(gfp_mask & __GFP_FS)) - return -1; + while(!list_empty(list)) { + gl = list_entry(list->next, struct gfs2_glock, gl_lru); + list_del_init(&gl->gl_lru); + clear_bit(GLF_LRU, &gl->gl_flags); + gfs2_glock_hold(gl); + spin_unlock(&lru_lock); + spin_lock(&gl->gl_spin); + if (demote_ok(gl)) + handle_callback(gl, LM_ST_UNLOCKED, 0); + WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); + smp_mb__after_clear_bit(); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put_nolock(gl); + spin_unlock(&gl->gl_spin); + spin_lock(&lru_lock); + } +} + +/** + * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote + * @nr: The number of entries to scan + * + * This function selects the entries on the LRU which are able to + * be demoted, and then kicks off the process by calling + * gfs2_dispose_glock_lru() above. + */ + +static void gfs2_scan_glock_lru(int nr) +{ + struct gfs2_glock *gl; + LIST_HEAD(skipped); + LIST_HEAD(dispose); spin_lock(&lru_lock); while(nr && !list_empty(&lru_list)) { gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); - list_del_init(&gl->gl_lru); - clear_bit(GLF_LRU, &gl->gl_flags); - atomic_dec(&lru_count); /* Test for being demotable */ if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { - gfs2_glock_hold(gl); - spin_unlock(&lru_lock); - spin_lock(&gl->gl_spin); - may_demote = demote_ok(gl); - if (may_demote) { - handle_callback(gl, LM_ST_UNLOCKED, 0); - nr--; - } - clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_clear_bit(); - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put_nolock(gl); - spin_unlock(&gl->gl_spin); - spin_lock(&lru_lock); + list_move(&gl->gl_lru, &dispose); + atomic_dec(&lru_count); + nr--; continue; } - nr_skipped++; - list_add(&gl->gl_lru, &skipped); - set_bit(GLF_LRU, &gl->gl_flags); + + list_move(&gl->gl_lru, &skipped); } list_splice(&skipped, &lru_list); - atomic_add(nr_skipped, &lru_count); + if (!list_empty(&dispose)) + gfs2_dispose_glock_lru(&dispose); spin_unlock(&lru_lock); -out: +} + +static int gfs2_shrink_glock_memory(struct shrinker *shrink, + struct shrink_control *sc) +{ + if (sc->nr_to_scan) { + if (!(sc->gfp_mask & __GFP_FS)) + return -1; + gfs2_scan_glock_lru(sc->nr_to_scan); + } + return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure; } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index c373a24fedd9..e2601ba38ef5 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -52,7 +52,6 @@ struct gfs2_log_header_host { */ struct gfs2_log_operations { - void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); void (*lo_before_commit) (struct gfs2_sbd *sdp); void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); void (*lo_before_scan) (struct gfs2_jdesc *jd, @@ -341,6 +340,7 @@ enum { GIF_QD_LOCKED = 1, GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, + GIF_ORDERED = 4, }; struct gfs2_inode { @@ -357,6 +357,7 @@ struct gfs2_inode { struct gfs2_rgrpd *i_rgd; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; + struct list_head i_ordered; struct list_head i_trunc_list; __be64 *i_hash_cache; u32 i_entries; @@ -641,6 +642,7 @@ struct gfs2_sbd { wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; + struct completion sd_wdack; struct delayed_work sd_control_work; /* Inode Stuff */ @@ -723,6 +725,7 @@ struct gfs2_sbd { struct list_head sd_log_le_revoke; struct list_head sd_log_le_databuf; struct list_head sd_log_le_ordered; + spinlock_t sd_ordered_lock; atomic_t sd_log_thresh1; atomic_t sd_log_thresh2; @@ -758,10 +761,7 @@ struct gfs2_sbd { unsigned int sd_replayed_blocks; /* For quiescing the filesystem */ - struct gfs2_holder sd_freeze_gh; - struct mutex sd_freeze_lock; - unsigned int sd_freeze_count; char sd_fsname[GFS2_FSNAME_LEN]; char sd_table_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2b6f5698ef18..db048a8ab6a8 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -447,7 +447,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, struct timespec tv = CURRENT_TIME; dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); di = (struct gfs2_dinode *)dibh->b_data; @@ -584,7 +584,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_end_trans; set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1); - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); return 0; @@ -931,7 +931,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_brelse; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); inc_nlink(&ip->i_inode); ip->i_inode.i_ctime = CURRENT_TIME; ihold(inode); @@ -1412,7 +1412,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_end_trans; ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index b906ed17a839..9802de0f85e6 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -281,6 +281,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int lvb_needs_unlock = 0; int error; if (gl->gl_lksb.sb_lkid == 0) { @@ -294,8 +295,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl) gfs2_update_request_times(gl); /* don't want to skip dlm_unlock writing the lvb when lock is ex */ + + if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE)) + lvb_needs_unlock = 1; + if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && - gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) { + !lvb_needs_unlock) { gfs2_glock_free(gl); return; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f4beeb9c81c1..9a2ca8be7647 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -482,70 +482,66 @@ static void log_flush_wait(struct gfs2_sbd *sdp) } } -static int bd_cmp(void *priv, struct list_head *a, struct list_head *b) +static int ip_cmp(void *priv, struct list_head *a, struct list_head *b) { - struct gfs2_bufdata *bda, *bdb; + struct gfs2_inode *ipa, *ipb; - bda = list_entry(a, struct gfs2_bufdata, bd_list); - bdb = list_entry(b, struct gfs2_bufdata, bd_list); + ipa = list_entry(a, struct gfs2_inode, i_ordered); + ipb = list_entry(b, struct gfs2_inode, i_ordered); - if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) + if (ipa->i_no_addr < ipb->i_no_addr) return -1; - if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr) + if (ipa->i_no_addr > ipb->i_no_addr) return 1; return 0; } static void gfs2_ordered_write(struct gfs2_sbd *sdp) { - struct gfs2_bufdata *bd; - struct buffer_head *bh; + struct gfs2_inode *ip; LIST_HEAD(written); - gfs2_log_lock(sdp); - list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp); + spin_lock(&sdp->sd_ordered_lock); + list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp); while (!list_empty(&sdp->sd_log_le_ordered)) { - bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_list); - list_move(&bd->bd_list, &written); - bh = bd->bd_bh; - if (!buffer_dirty(bh)) + ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); + list_move(&ip->i_ordered, &written); + if (ip->i_inode.i_mapping->nrpages == 0) continue; - get_bh(bh); - gfs2_log_unlock(sdp); - lock_buffer(bh); - if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { - bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE_SYNC, bh); - } else { - unlock_buffer(bh); - brelse(bh); - } - gfs2_log_lock(sdp); + spin_unlock(&sdp->sd_ordered_lock); + filemap_fdatawrite(ip->i_inode.i_mapping); + spin_lock(&sdp->sd_ordered_lock); } list_splice(&written, &sdp->sd_log_le_ordered); - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ordered_lock); } static void gfs2_ordered_wait(struct gfs2_sbd *sdp) { - struct gfs2_bufdata *bd; - struct buffer_head *bh; + struct gfs2_inode *ip; - gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ordered_lock); while (!list_empty(&sdp->sd_log_le_ordered)) { - bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_list); - bh = bd->bd_bh; - if (buffer_locked(bh)) { - get_bh(bh); - gfs2_log_unlock(sdp); - wait_on_buffer(bh); - brelse(bh); - gfs2_log_lock(sdp); + ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); + list_del(&ip->i_ordered); + WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags)); + if (ip->i_inode.i_mapping->nrpages == 0) continue; - } - list_del_init(&bd->bd_list); + spin_unlock(&sdp->sd_ordered_lock); + filemap_fdatawait(ip->i_inode.i_mapping); + spin_lock(&sdp->sd_ordered_lock); } - gfs2_log_unlock(sdp); + spin_unlock(&sdp->sd_ordered_lock); +} + +void gfs2_ordered_del_inode(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + spin_lock(&sdp->sd_ordered_lock); + if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags)) + list_del(&ip->i_ordered); + spin_unlock(&sdp->sd_ordered_lock); } /** diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 3fd5215ea25f..3566f35915e0 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -48,6 +48,18 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, sdp->sd_log_head = sdp->sd_log_tail = value; } +static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + if (!test_bit(GIF_ORDERED, &ip->i_flags)) { + spin_lock(&sdp->sd_ordered_lock); + if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) + list_add(&ip->i_ordered, &sdp->sd_log_le_ordered); + spin_unlock(&sdp->sd_ordered_lock); + } +} +extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, unsigned int ssize); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 9ceccb1595a3..a5055977a214 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -37,7 +37,7 @@ * * The log lock must be held when calling this function */ -static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) +void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) { struct gfs2_bufdata *bd; @@ -388,32 +388,6 @@ static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, return page; } -static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ - struct gfs2_meta_header *mh; - struct gfs2_trans *tr; - - tr = current->journal_info; - tr->tr_touched = 1; - if (!list_empty(&bd->bd_list)) - return; - set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); - set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); - mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; - if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { - printk(KERN_ERR - "Attempting to add uninitialised block to journal (inplace block=%lld)\n", - (unsigned long long)bd->bd_bh->b_blocknr); - BUG(); - } - gfs2_pin(sdp, bd->bd_bh); - mh->__pad0 = cpu_to_be64(0); - mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); - sdp->sd_log_num_buf++; - list_add(&bd->bd_list, &sdp->sd_log_le_buf); - tr->tr_num_buf_new++; -} - static void gfs2_check_magic(struct buffer_head *bh) { void *kaddr; @@ -600,20 +574,6 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); } -static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ - struct gfs2_glock *gl = bd->bd_gl; - struct gfs2_trans *tr; - - tr = current->journal_info; - tr->tr_touched = 1; - tr->tr_num_revoke++; - sdp->sd_log_num_revoke++; - atomic_inc(&gl->gl_revokes); - set_bit(GLF_LFLUSH, &gl->gl_flags); - list_add(&bd->bd_list, &sdp->sd_log_le_revoke); -} - static void revoke_lo_before_commit(struct gfs2_sbd *sdp) { struct gfs2_meta_header *mh; @@ -749,44 +709,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) } /** - * databuf_lo_add - Add a databuf to the transaction. - * - * This is used in two distinct cases: - * i) In ordered write mode - * We put the data buffer on a list so that we can ensure that its - * synced to disk at the right time - * ii) In journaled data mode - * We need to journal the data block in the same way as metadata in - * the functions above. The difference is that here we have a tag - * which is two __be64's being the block number (as per meta data) - * and a flag which says whether the data block needs escaping or - * not. This means we need a new log entry for each 251 or so data - * blocks, which isn't an enormous overhead but twice as much as - * for normal metadata blocks. - */ -static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ - struct gfs2_trans *tr = current->journal_info; - struct address_space *mapping = bd->bd_bh->b_page->mapping; - struct gfs2_inode *ip = GFS2_I(mapping->host); - - if (tr) - tr->tr_touched = 1; - if (!list_empty(&bd->bd_list)) - return; - set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); - set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); - if (gfs2_is_jdata(ip)) { - gfs2_pin(sdp, bd->bd_bh); - tr->tr_num_databuf_new++; - sdp->sd_log_num_databuf++; - list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); - } else { - list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered); - } -} - -/** * databuf_lo_before_commit - Scan the data buffers, writing as we go * */ @@ -885,7 +807,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) const struct gfs2_log_operations gfs2_buf_lops = { - .lo_add = buf_lo_add, .lo_before_commit = buf_lo_before_commit, .lo_after_commit = buf_lo_after_commit, .lo_before_scan = buf_lo_before_scan, @@ -895,7 +816,6 @@ const struct gfs2_log_operations gfs2_buf_lops = { }; const struct gfs2_log_operations gfs2_revoke_lops = { - .lo_add = revoke_lo_add, .lo_before_commit = revoke_lo_before_commit, .lo_after_commit = revoke_lo_after_commit, .lo_before_scan = revoke_lo_before_scan, @@ -909,7 +829,6 @@ const struct gfs2_log_operations gfs2_rg_lops = { }; const struct gfs2_log_operations gfs2_databuf_lops = { - .lo_add = databuf_lo_add, .lo_before_commit = databuf_lo_before_commit, .lo_after_commit = databuf_lo_after_commit, .lo_scan_elements = databuf_lo_scan_elements, diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 954a330585f4..ba77b7da8325 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -29,6 +29,7 @@ extern const struct gfs2_log_operations gfs2_databuf_lops; extern const struct gfs2_log_operations *gfs2_log_ops[]; extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw); +extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { @@ -46,19 +47,6 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) return limit; } -static inline void lops_init_le(struct gfs2_bufdata *bd, - const struct gfs2_log_operations *lops) -{ - INIT_LIST_HEAD(&bd->bd_list); - bd->bd_ops = lops; -} - -static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) -{ - if (bd->bd_ops->lo_add) - bd->bd_ops->lo_add(sdp, bd); -} - static inline void lops_before_commit(struct gfs2_sbd *sdp) { int x; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 22255d96b27e..b059bbb5059e 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -271,41 +271,6 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) return 0; } -/** - * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer - * @gl: the glock the buffer belongs to - * @bh: The buffer to be attached to - * @meta: Flag to indicate whether its metadata or not - */ - -void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, - int meta) -{ - struct gfs2_bufdata *bd; - - if (meta) - lock_page(bh->b_page); - - if (bh->b_private) { - if (meta) - unlock_page(bh->b_page); - return; - } - - bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); - bd->bd_bh = bh; - bd->bd_gl = gl; - - if (meta) - lops_init_le(bd, &gfs2_buf_lops); - else - lops_init_le(bd, &gfs2_databuf_lops); - bh->b_private = bd; - - if (meta) - unlock_page(bh->b_page); -} - void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) { struct address_space *mapping = bh->b_page->mapping; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index c30973b07a7c..0d4c843b6f8e 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -56,9 +56,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); -void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, - int meta); - void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 0e3554edb8f2..1b612be4b873 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -81,6 +81,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_waitqueue_head(&sdp->sd_glock_wait); atomic_set(&sdp->sd_glock_disposal, 0); init_completion(&sdp->sd_locking_init); + init_completion(&sdp->sd_wdack); spin_lock_init(&sdp->sd_statfs_spin); spin_lock_init(&sdp->sd_rindex_spin); @@ -102,6 +103,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) INIT_LIST_HEAD(&sdp->sd_log_le_revoke); INIT_LIST_HEAD(&sdp->sd_log_le_databuf); INIT_LIST_HEAD(&sdp->sd_log_le_ordered); + spin_lock_init(&sdp->sd_ordered_lock); init_waitqueue_head(&sdp->sd_log_waitq); init_waitqueue_head(&sdp->sd_logd_waitq); @@ -115,8 +117,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) INIT_LIST_HEAD(&sdp->sd_revoke_list); - mutex_init(&sdp->sd_freeze_lock); - return sdp; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index ae55e248c3b7..06122d09c0d1 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -590,7 +590,7 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) s64 x; mutex_lock(&sdp->sd_quota_mutex); - gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1); + gfs2_trans_add_meta(ip->i_gl, qd->qd_bh); if (!test_bit(QDF_CHANGE, &qd->qd_flags)) { qc->qc_change = 0; @@ -726,7 +726,7 @@ get_a_page: goto unlock_out; } - gfs2_trans_add_bh(ip->i_gl, bh, 0); + gfs2_trans_add_meta(ip->i_gl, bh); kaddr = kmap_atomic(page); if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index b7eff078fe90..52c2aeaf45ce 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1323,7 +1323,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (ret == 0) { bh = rgd->rd_bits[0].bi_bh; rgd->rd_flags |= GFS2_RGF_TRIMMED; - gfs2_trans_add_bh(rgd->rd_gl, bh, 1); + gfs2_trans_add_meta(rgd->rd_gl, bh); gfs2_rgrp_out(rgd, bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data); gfs2_trans_end(sdp); @@ -1968,14 +1968,14 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode, *n = 1; block = gfs2_rbm_to_block(rbm); - gfs2_trans_add_bh(rbm->rgd->rd_gl, rbm->bi->bi_bh, 1); + gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm->bi->bi_bh); gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); block++; while (*n < elen) { ret = gfs2_rbm_from_block(&pos, block); if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE) break; - gfs2_trans_add_bh(pos.rgd->rd_gl, pos.bi->bi_bh, 1); + gfs2_trans_add_meta(pos.rgd->rd_gl, pos.bi->bi_bh); gfs2_setbit(&pos, true, GFS2_BLKST_USED); (*n)++; block++; @@ -2014,7 +2014,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, rbm.bi->bi_bh->b_data + rbm.bi->bi_offset, rbm.bi->bi_len); } - gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.bi->bi_bh, 1); + gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.bi->bi_bh); gfs2_setbit(&rbm, false, new_state); } @@ -2157,7 +2157,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, if (error == 0) { struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); brelse(dibh); @@ -2176,7 +2176,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, *generation = rbm.rgd->rd_igeneration++; } - gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh, 1); + gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data); @@ -2223,7 +2223,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); rgd->rd_free += blen; rgd->rd_flags &= ~GFS2_RGF_TRIMMED; - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); @@ -2260,7 +2260,7 @@ void gfs2_unlink_di(struct inode *inode) if (!rgd) return; trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); update_rgrp_lvb_unlinked(rgd, 1); @@ -2281,7 +2281,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) rgd->rd_dinodes--; rgd->rd_free++; - gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); update_rgrp_lvb_unlinked(rgd, -1); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index d6488674d916..a3b40eeaa6e2 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -500,7 +500,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, if (error) return; - gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + gfs2_trans_add_meta(l_ip->i_gl, l_bh); spin_lock(&sdp->sd_statfs_spin); l_sc->sc_total += total; @@ -528,7 +528,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + gfs2_trans_add_meta(l_ip->i_gl, l_bh); spin_lock(&sdp->sd_statfs_spin); m_sc->sc_total += l_sc->sc_total; @@ -539,7 +539,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, 0, sizeof(struct gfs2_statfs_change)); spin_unlock(&sdp->sd_statfs_spin); - gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); + gfs2_trans_add_meta(m_ip->i_gl, m_bh); gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); } @@ -663,54 +663,6 @@ out: return error; } -/** - * gfs2_freeze_fs - freezes the file system - * @sdp: the file system - * - * This function flushes data and meta data for all machines by - * acquiring the transaction log exclusively. All journals are - * ensured to be in a clean state as well. - * - * Returns: errno - */ - -int gfs2_freeze_fs(struct gfs2_sbd *sdp) -{ - int error = 0; - - mutex_lock(&sdp->sd_freeze_lock); - - if (!sdp->sd_freeze_count++) { - error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); - if (error) - sdp->sd_freeze_count--; - } - - mutex_unlock(&sdp->sd_freeze_lock); - - return error; -} - -/** - * gfs2_unfreeze_fs - unfreezes the file system - * @sdp: the file system - * - * This function allows the file system to proceed by unlocking - * the exclusively held transaction lock. Other GFS2 nodes are - * now free to acquire the lock shared and go on with their lives. - * - */ - -void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) -{ - mutex_lock(&sdp->sd_freeze_lock); - - if (sdp->sd_freeze_count && !--sdp->sd_freeze_count) - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); - - mutex_unlock(&sdp->sd_freeze_lock); -} - void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) { struct gfs2_dinode *str = buf; @@ -824,7 +776,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) ret = gfs2_meta_inode_buffer(ip, &bh); if (ret == 0) { - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_dinode_out(ip, bh->b_data); brelse(bh); } @@ -888,13 +840,6 @@ static void gfs2_put_super(struct super_block *sb) int error; struct gfs2_jdesc *jd; - /* Unfreeze the filesystem, if we need to */ - - mutex_lock(&sdp->sd_freeze_lock); - if (sdp->sd_freeze_count) - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); - mutex_unlock(&sdp->sd_freeze_lock); - /* No more recovery requests */ set_bit(SDF_NORECOVERY, &sdp->sd_flags); smp_mb(); @@ -985,7 +930,7 @@ static int gfs2_freeze(struct super_block *sb) return -EINVAL; for (;;) { - error = gfs2_freeze_fs(sdp); + error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); if (!error) break; @@ -1013,7 +958,9 @@ static int gfs2_freeze(struct super_block *sb) static int gfs2_unfreeze(struct super_block *sb) { - gfs2_unfreeze_fs(sb->s_fs_info); + struct gfs2_sbd *sdp = sb->s_fs_info; + + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); return 0; } @@ -1577,6 +1524,7 @@ out: /* Case 3 starts here */ truncate_inode_pages(&inode->i_data, 0); gfs2_rs_delete(ip); + gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); ip->i_gl->gl_object = NULL; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index a0464680af0b..90e3322ffa10 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -46,9 +46,6 @@ extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct buffer_head *l_bh); extern int gfs2_statfs_sync(struct super_block *sb, int type); -extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); -extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); - extern struct file_system_type gfs2_fs_type; extern struct file_system_type gfs2meta_fs_type; extern const struct export_operations gfs2_export_ops; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 8056b7b7238e..4fb9ad80d260 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -91,19 +91,15 @@ static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) { - unsigned int count; - - mutex_lock(&sdp->sd_freeze_lock); - count = sdp->sd_freeze_count; - mutex_unlock(&sdp->sd_freeze_lock); + struct super_block *sb = sdp->sd_vfs; + int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1; - return snprintf(buf, PAGE_SIZE, "%u\n", count); + return snprintf(buf, PAGE_SIZE, "%u\n", frozen); } static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { - ssize_t ret = len; - int error = 0; + int error; int n = simple_strtol(buf, NULL, 0); if (!capable(CAP_SYS_ADMIN)) @@ -111,19 +107,21 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) switch (n) { case 0: - gfs2_unfreeze_fs(sdp); + error = thaw_super(sdp->sd_vfs); break; case 1: - error = gfs2_freeze_fs(sdp); + error = freeze_super(sdp->sd_vfs); break; default: - ret = -EINVAL; + return -EINVAL; } - if (error) + if (error) { fs_warn(sdp, "freeze %d error %d", n, error); + return error; + } - return ret; + return len; } static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) @@ -332,6 +330,28 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) return ret; } +static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf) +{ + int val = completion_done(&sdp->sd_wdack) ? 1 : 0; + + return sprintf(buf, "%d\n", val); +} + +static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if ((val == 1) && + !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) + complete(&sdp->sd_wdack); + else + ret = -EINVAL; + return ret; +} + static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -463,7 +483,7 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); GDLM_ATTR(block, 0644, block_show, block_store); -GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(withdraw, 0644, wdack_show, wdack_store); GDLM_ATTR(jid, 0644, jid_show, jid_store); GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store); GDLM_ATTR(first_done, 0444, first_done_show, NULL); diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 413627072f36..88162fae27a5 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -18,6 +18,7 @@ #include "gfs2.h" #include "incore.h" #include "glock.h" +#include "inode.h" #include "log.h" #include "lops.h" #include "meta_io.h" @@ -142,44 +143,143 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) sb_end_intwrite(sdp->sd_vfs); } +static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, + struct buffer_head *bh, + const struct gfs2_log_operations *lops) +{ + struct gfs2_bufdata *bd; + + bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); + bd->bd_bh = bh; + bd->bd_gl = gl; + bd->bd_ops = lops; + INIT_LIST_HEAD(&bd->bd_list); + bh->b_private = bd; + return bd; +} + /** - * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction - * @gl: the glock the buffer belongs to + * gfs2_trans_add_data - Add a databuf to the transaction. + * @gl: The inode glock associated with the buffer * @bh: The buffer to add - * @meta: True in the case of adding metadata * + * This is used in two distinct cases: + * i) In ordered write mode + * We put the data buffer on a list so that we can ensure that its + * synced to disk at the right time + * ii) In journaled data mode + * We need to journal the data block in the same way as metadata in + * the functions above. The difference is that here we have a tag + * which is two __be64's being the block number (as per meta data) + * and a flag which says whether the data block needs escaping or + * not. This means we need a new log entry for each 251 or so data + * blocks, which isn't an enormous overhead but twice as much as + * for normal metadata blocks. */ +void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) +{ + struct gfs2_trans *tr = current->journal_info; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = bh->b_page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_bufdata *bd; -void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) + if (!gfs2_is_jdata(ip)) { + gfs2_ordered_add_inode(ip); + return; + } + + lock_buffer(bh); + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd == NULL) { + gfs2_log_unlock(sdp); + unlock_buffer(bh); + if (bh->b_private == NULL) + bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops); + lock_buffer(bh); + gfs2_log_lock(sdp); + } + gfs2_assert(sdp, bd->bd_gl == gl); + tr->tr_touched = 1; + if (list_empty(&bd->bd_list)) { + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + gfs2_pin(sdp, bd->bd_bh); + tr->tr_num_databuf_new++; + sdp->sd_log_num_databuf++; + list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); + } + gfs2_log_unlock(sdp); + unlock_buffer(bh); +} + +static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { + struct gfs2_meta_header *mh; + struct gfs2_trans *tr; + + tr = current->journal_info; + tr->tr_touched = 1; + if (!list_empty(&bd->bd_list)) + return; + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; + if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { + printk(KERN_ERR + "Attempting to add uninitialised block to journal (inplace block=%lld)\n", + (unsigned long long)bd->bd_bh->b_blocknr); + BUG(); + } + gfs2_pin(sdp, bd->bd_bh); + mh->__pad0 = cpu_to_be64(0); + mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + sdp->sd_log_num_buf++; + list_add(&bd->bd_list, &sdp->sd_log_le_buf); + tr->tr_num_buf_new++; +} + +void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_bufdata *bd; lock_buffer(bh); gfs2_log_lock(sdp); bd = bh->b_private; - if (bd) - gfs2_assert(sdp, bd->bd_gl == gl); - else { + if (bd == NULL) { gfs2_log_unlock(sdp); unlock_buffer(bh); - gfs2_attach_bufdata(gl, bh, meta); - bd = bh->b_private; + lock_page(bh->b_page); + if (bh->b_private == NULL) + bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops); + unlock_page(bh->b_page); lock_buffer(bh); gfs2_log_lock(sdp); } - lops_add(sdp, bd); + gfs2_assert(sdp, bd->bd_gl == gl); + meta_lo_add(sdp, bd); gfs2_log_unlock(sdp); unlock_buffer(bh); } void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { + struct gfs2_glock *gl = bd->bd_gl; + struct gfs2_trans *tr = current->journal_info; + BUG_ON(!list_empty(&bd->bd_list)); BUG_ON(!list_empty(&bd->bd_ail_st_list)); BUG_ON(!list_empty(&bd->bd_ail_gl_list)); - lops_init_le(bd, &gfs2_revoke_lops); - lops_add(sdp, bd); + bd->bd_ops = &gfs2_revoke_lops; + tr->tr_touched = 1; + tr->tr_num_revoke++; + sdp->sd_log_num_revoke++; + atomic_inc(&gl->gl_revokes); + set_bit(GLF_LFLUSH, &gl->gl_flags); + list_add(&bd->bd_list, &sdp->sd_log_le_revoke); } void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index bf2ae9aeee7a..1e6e7da25a17 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -39,7 +39,8 @@ extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, unsigned int revokes); extern void gfs2_trans_end(struct gfs2_sbd *sdp); -extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); +extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); +extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index f00d7c5744f6..6402fb69d71b 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -54,6 +54,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); + if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) + wait_for_completion(&sdp->sd_wdack); + if (lm->lm_unmount) { fs_err(sdp, "telling LM to unmount\n"); lm->lm_unmount(sdp); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 76c144b3c9bb..cbb46c2baa69 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -270,7 +270,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, if (error) goto out_gunlock; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); dataptrs = GFS2_EA2DATAPTRS(ea); for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { @@ -309,7 +309,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -509,7 +509,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, } if (din) { - gfs2_trans_add_bh(ip->i_gl, bh[x], 1); + gfs2_trans_add_meta(ip->i_gl, bh[x]); memcpy(pos, din, cp_size); din += sdp->sd_jbsize; } @@ -629,7 +629,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) return error; gfs2_trans_add_unrevoke(sdp, block, 1); *bhp = gfs2_meta_new(ip->i_gl, block); - gfs2_trans_add_bh(ip->i_gl, *bhp, 1); + gfs2_trans_add_meta(ip->i_gl, *bhp); gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header)); @@ -691,7 +691,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, return error; gfs2_trans_add_unrevoke(sdp, block, 1); bh = gfs2_meta_new(ip->i_gl, block); - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); gfs2_add_inode_blocks(&ip->i_inode, 1); @@ -751,7 +751,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -834,7 +834,7 @@ static void ea_set_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_header *prev = el->el_prev; u32 len; - gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + gfs2_trans_add_meta(ip->i_gl, el->el_bh); if (!prev || !GFS2_EA_IS_STUFFED(ea)) { ea->ea_type = GFS2_EATYPE_UNUSED; @@ -872,7 +872,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, if (error) return error; - gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_trans_add_meta(ip->i_gl, bh); if (es->ea_split) ea = ea_split_ea(ea); @@ -886,7 +886,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, if (error) goto out; ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); out: @@ -901,7 +901,7 @@ static int ea_set_simple_alloc(struct gfs2_inode *ip, struct gfs2_ea_header *ea = es->es_ea; int error; - gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1); + gfs2_trans_add_meta(ip->i_gl, es->es_bh); if (es->ea_split) ea = ea_split_ea(ea); @@ -997,7 +997,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, goto out; } - gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_trans_add_meta(ip->i_gl, indbh); } else { u64 blk; unsigned int n = 1; @@ -1006,7 +1006,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, return error; gfs2_trans_add_unrevoke(sdp, blk, 1); indbh = gfs2_meta_new(ip->i_gl, blk); - gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_trans_add_meta(ip->i_gl, indbh); gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); gfs2_buffer_clear_tail(indbh, mh_size); @@ -1092,7 +1092,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) if (error) return error; - gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + gfs2_trans_add_meta(ip->i_gl, el->el_bh); if (prev) { u32 len; @@ -1109,7 +1109,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -1265,7 +1265,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) if (GFS2_EA_IS_STUFFED(el.el_ea)) { error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); if (error == 0) { - gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); + gfs2_trans_add_meta(ip->i_gl, el.el_bh); memcpy(GFS2_EA2DATA(el.el_ea), data, GFS2_EA_DATA_LEN(el.el_ea)); } @@ -1352,7 +1352,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) if (error) goto out_gunlock; - gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_trans_add_meta(ip->i_gl, indbh); eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); bstart = 0; @@ -1384,7 +1384,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } @@ -1434,7 +1434,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); } diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index a2862339323b..81cc7eaff863 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -446,7 +446,8 @@ int __log_start_commit(journal_t *journal, tid_t target) * currently running transaction (if it exists). Otherwise, * the target tid must be an old one. */ - if (journal->j_running_transaction && + if (journal->j_commit_request != target && + journal->j_running_transaction && journal->j_running_transaction->t_tid == target) { /* * We want a new commit: OK, mark the request and wakeup the diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 54f9e6ce0430..52e5120bb159 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -550,6 +550,9 @@ again: status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); if (status < 0) break; + /* Resend the blocking lock request after a server reboot */ + if (resp->status == nlm_lck_denied_grace_period) + continue; if (resp->status != nlm_lck_blocked) break; } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 4fa788c93f46..434b93ec0970 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -1273,6 +1273,7 @@ static const struct nfs_pageio_ops bl_pg_write_ops = { static struct pnfs_layoutdriver_type blocklayout_type = { .id = LAYOUT_BLOCK_VOLUME, .name = "LAYOUT_BLOCK_VOLUME", + .owner = THIS_MODULE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, .alloc_layout_hdr = bl_alloc_layout_hdr, diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c89b26bc9759..2960512792c2 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -183,60 +183,15 @@ static u32 initiate_file_draining(struct nfs_client *clp, static u32 initiate_bulk_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { - struct nfs_server *server; - struct pnfs_layout_hdr *lo; - struct inode *ino; - u32 rv = NFS4ERR_NOMATCHING_LAYOUT; - struct pnfs_layout_hdr *tmp; - LIST_HEAD(recall_list); - LIST_HEAD(free_me_list); - struct pnfs_layout_range range = { - .iomode = IOMODE_ANY, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; - - spin_lock(&clp->cl_lock); - rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - if ((args->cbl_recall_type == RETURN_FSID) && - memcmp(&server->fsid, &args->cbl_fsid, - sizeof(struct nfs_fsid))) - continue; - - list_for_each_entry(lo, &server->layouts, plh_layouts) { - ino = igrab(lo->plh_inode); - if (ino) - continue; - spin_lock(&ino->i_lock); - /* Is this layout in the process of being freed? */ - if (NFS_I(ino)->layout != lo) { - spin_unlock(&ino->i_lock); - iput(ino); - continue; - } - pnfs_get_layout_hdr(lo); - spin_unlock(&ino->i_lock); - list_add(&lo->plh_bulk_recall, &recall_list); - } - } - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); + int stat; - list_for_each_entry_safe(lo, tmp, - &recall_list, plh_bulk_recall) { - ino = lo->plh_inode; - spin_lock(&ino->i_lock); - set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); - if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range)) - rv = NFS4ERR_DELAY; - list_del_init(&lo->plh_bulk_recall); - spin_unlock(&ino->i_lock); - pnfs_free_lseg_list(&free_me_list); - pnfs_put_layout_hdr(lo); - iput(ino); - } - return rv; + if (args->cbl_recall_type == RETURN_FSID) + stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true); + else + stat = pnfs_destroy_layouts_byclid(clp, true); + if (stat != 0) + return NFS4ERR_DELAY; + return NFS4ERR_NOMATCHING_LAYOUT; } static u32 do_callback_layoutrecall(struct nfs_client *clp, diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 81c5eec3cf38..6390a4b5fee7 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -55,7 +55,8 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags) flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL && (delegation->type & flags) == flags) { + if (delegation != NULL && (delegation->type & flags) == flags && + !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { nfs_mark_delegation_referenced(delegation); ret = 1; } @@ -70,8 +71,10 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ int status = 0; if (inode->i_flock == NULL) - goto out; + return 0; + if (inode->i_flock == NULL) + goto out; /* Protect inode->i_flock using the file locks lock */ lock_flocks(); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { @@ -94,7 +97,9 @@ static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *s { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; + struct nfs4_state_owner *sp; struct nfs4_state *state; + unsigned int seq; int err; again: @@ -109,9 +114,16 @@ again: continue; get_nfs_open_context(ctx); spin_unlock(&inode->i_lock); + sp = state->owner; + /* Block nfs4_proc_unlck */ + mutex_lock(&sp->so_delegreturn_mutex); + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid); - if (err >= 0) + if (!err) err = nfs_delegation_claim_locks(ctx, state); + if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) + err = -EAGAIN; + mutex_unlock(&sp->so_delegreturn_mutex); put_nfs_open_context(ctx); if (err != 0) return err; @@ -182,39 +194,91 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation } static struct nfs_delegation * +nfs_start_delegation_return_locked(struct nfs_inode *nfsi) +{ + struct nfs_delegation *ret = NULL; + struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); + + if (delegation == NULL) + goto out; + spin_lock(&delegation->lock); + if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + ret = delegation; + spin_unlock(&delegation->lock); +out: + return ret; +} + +static struct nfs_delegation * +nfs_start_delegation_return(struct nfs_inode *nfsi) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = nfs_start_delegation_return_locked(nfsi); + rcu_read_unlock(); + return delegation; +} + +static void +nfs_abort_delegation_return(struct nfs_delegation *delegation, + struct nfs_client *clp) +{ + + spin_lock(&delegation->lock); + clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + spin_unlock(&delegation->lock); + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); +} + +static struct nfs_delegation * nfs_detach_delegation_locked(struct nfs_inode *nfsi, - struct nfs_server *server) + struct nfs_delegation *delegation, + struct nfs_client *clp) { - struct nfs_delegation *delegation = + struct nfs_delegation *deleg_cur = rcu_dereference_protected(nfsi->delegation, - lockdep_is_held(&server->nfs_client->cl_lock)); + lockdep_is_held(&clp->cl_lock)); - if (delegation == NULL) - goto nomatch; + if (deleg_cur == NULL || delegation != deleg_cur) + return NULL; spin_lock(&delegation->lock); + set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); list_del_rcu(&delegation->super_list); delegation->inode = NULL; nfsi->delegation_state = 0; rcu_assign_pointer(nfsi->delegation, NULL); spin_unlock(&delegation->lock); return delegation; -nomatch: - return NULL; } static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, - struct nfs_server *server) + struct nfs_delegation *delegation, + struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; - struct nfs_delegation *delegation; spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, server); + delegation = nfs_detach_delegation_locked(nfsi, delegation, clp); spin_unlock(&clp->cl_lock); return delegation; } +static struct nfs_delegation * +nfs_inode_detach_delegation(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_delegation *delegation; + + delegation = nfs_start_delegation_return(nfsi); + if (delegation == NULL) + return NULL; + return nfs_detach_delegation(nfsi, delegation, server); +} + /** * nfs_inode_set_delegation - set up a delegation on an inode * @inode: inode to which delegation applies @@ -268,7 +332,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = NULL; goto out; } - freeme = nfs_detach_delegation_locked(nfsi, server); + freeme = nfs_detach_delegation_locked(nfsi, + old_delegation, clp); + if (freeme == NULL) + goto out; } list_add_rcu(&delegation->super_list, &server->delegations); nfsi->delegation_state = delegation->type; @@ -292,19 +359,29 @@ out: /* * Basic procedure for returning a delegation to the server */ -static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) { + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); int err; - /* - * Guard against new delegated open/lock/unlock calls and against - * state recovery - */ - down_write(&nfsi->rwsem); - err = nfs_delegation_claim_opens(inode, &delegation->stateid); - up_write(&nfsi->rwsem); - if (err) + if (delegation == NULL) + return 0; + do { + err = nfs_delegation_claim_opens(inode, &delegation->stateid); + if (!issync || err != -EAGAIN) + break; + /* + * Guard against state recovery + */ + err = nfs4_wait_clnt_recover(clp); + } while (err == 0); + + if (err) { + nfs_abort_delegation_return(delegation, clp); + goto out; + } + if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode))) goto out; err = nfs_do_return_delegation(inode, delegation, issync); @@ -340,13 +417,10 @@ restart: inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) continue; - delegation = nfs_detach_delegation(NFS_I(inode), - server); + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); - if (delegation != NULL) - err = __nfs_inode_return_delegation(inode, - delegation, 0); + err = nfs_end_delegation_return(inode, delegation, 0); iput(inode); if (!err) goto restart; @@ -367,15 +441,11 @@ restart: */ void nfs_inode_return_delegation_noreclaim(struct inode *inode) { - struct nfs_server *server = NFS_SERVER(inode); - struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - if (rcu_access_pointer(nfsi->delegation) != NULL) { - delegation = nfs_detach_delegation(nfsi, server); - if (delegation != NULL) - nfs_do_return_delegation(inode, delegation, 0); - } + delegation = nfs_inode_detach_delegation(inode); + if (delegation != NULL) + nfs_do_return_delegation(inode, delegation, 0); } /** @@ -390,18 +460,14 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode) */ int nfs4_inode_return_delegation(struct inode *inode) { - struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; int err = 0; nfs_wb_all(inode); - if (rcu_access_pointer(nfsi->delegation) != NULL) { - delegation = nfs_detach_delegation(nfsi, server); - if (delegation != NULL) { - err = __nfs_inode_return_delegation(inode, delegation, 1); - } - } + delegation = nfs_start_delegation_return(nfsi); + if (delegation != NULL) + err = nfs_end_delegation_return(inode, delegation, 1); return err; } @@ -471,7 +537,7 @@ void nfs_remove_bad_delegation(struct inode *inode) { struct nfs_delegation *delegation; - delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode)); + delegation = nfs_inode_detach_delegation(inode); if (delegation) { nfs_inode_find_state_and_recover(inode, &delegation->stateid); nfs_free_delegation(delegation); @@ -649,7 +715,7 @@ restart: if (inode == NULL) continue; delegation = nfs_detach_delegation(NFS_I(inode), - server); + delegation, server); rcu_read_unlock(); if (delegation != NULL) diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index bbc6a4dba0d8..d54d4fca6793 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -29,6 +29,7 @@ enum { NFS_DELEGATION_NEED_RECLAIM = 0, NFS_DELEGATION_RETURN, NFS_DELEGATION_REFERENCED, + NFS_DELEGATION_RETURNING, }; int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 32e6c53520e2..1b2d7eb93796 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2153,12 +2153,16 @@ static int nfs_open_permission_mask(int openflags) { int mask = 0; - if ((openflags & O_ACCMODE) != O_WRONLY) - mask |= MAY_READ; - if ((openflags & O_ACCMODE) != O_RDONLY) - mask |= MAY_WRITE; - if (openflags & __FMODE_EXEC) - mask |= MAY_EXEC; + if (openflags & __FMODE_EXEC) { + /* ONLY check exec rights */ + mask = MAY_EXEC; + } else { + if ((openflags & O_ACCMODE) != O_WRONLY) + mask |= MAY_READ; + if ((openflags & O_ACCMODE) != O_RDONLY) + mask |= MAY_WRITE; + } + return mask; } diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 033803c36644..44efaa8c5f78 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -126,8 +126,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, } spin_unlock(&ret->d_lock); out: - if (name) - kfree(name); + kfree(name); nfs_free_fattr(fsinfo.fattr); return ret; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ebeb94ce1b0b..6acc73c80d7f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -694,10 +694,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) if (ctx->cred != NULL) put_rpccred(ctx->cred); dput(ctx->dentry); - if (is_sync) - nfs_sb_deactive(sb); - else - nfs_sb_deactive_async(sb); + nfs_sb_deactive(sb); kfree(ctx->mdsthreshold); kfree(ctx); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f0e6c7df1a07..541c9ebdbc5a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -329,7 +329,6 @@ extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern void nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); -extern void nfs_sb_deactive_async(struct super_block *sb); /* namespace.c */ #define NFS_PATH_CANONICAL 1 diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index dd057bc6b65b..fc8dc20fdeb9 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -177,11 +177,31 @@ out_nofree: return mnt; } +static int +nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + if (NFS_FH(dentry->d_inode)->size != 0) + return nfs_getattr(mnt, dentry, stat); + generic_fillattr(dentry->d_inode, stat); + return 0; +} + +static int +nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) +{ + if (NFS_FH(dentry->d_inode)->size != 0) + return nfs_setattr(dentry, attr); + return -EACCES; +} + const struct inode_operations nfs_mountpoint_inode_operations = { .getattr = nfs_getattr, + .setattr = nfs_setattr, }; const struct inode_operations nfs_referral_inode_operations = { + .getattr = nfs_namespace_getattr, + .setattr = nfs_namespace_setattr, }; static void nfs_expire_automounts(struct work_struct *work) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a3f488b074a2..944c9a5c1039 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -13,6 +13,8 @@ #define NFS4_MAX_LOOP_ON_RECOVER (10) +#include <linux/seqlock.h> + struct idmap; enum nfs4_client_state { @@ -90,6 +92,8 @@ struct nfs4_state_owner { unsigned long so_flags; struct list_head so_states; struct nfs_seqid_counter so_seqid; + seqcount_t so_reclaim_seqcount; + struct mutex so_delegreturn_mutex; }; enum { diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index acc347268124..2e9779b58b7a 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -236,11 +236,10 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; + nfs_put_client(clp); if (clp != old) { clp->cl_preserve_clid = true; - nfs_put_client(clp); clp = old; - atomic_inc(&clp->cl_count); } return clp; @@ -306,7 +305,7 @@ int nfs40_walk_client_list(struct nfs_client *new, .clientid = new->cl_clientid, .confirm = new->cl_confirm, }; - int status; + int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { @@ -332,40 +331,33 @@ int nfs40_walk_client_list(struct nfs_client *new, if (prev) nfs_put_client(prev); + prev = pos; status = nfs4_proc_setclientid_confirm(pos, &clid, cred); - if (status == 0) { + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + break; + case 0: nfs4_swap_callback_idents(pos, new); - nfs_put_client(pos); + prev = NULL; *result = pos; dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); - return 0; - } - if (status != -NFS4ERR_STALE_CLIENTID) { - nfs_put_client(pos); - dprintk("NFS: <-- %s status = %d, no result\n", - __func__, status); - return status; + default: + goto out; } spin_lock(&nn->nfs_client_lock); - prev = pos; } + spin_unlock(&nn->nfs_client_lock); - /* - * No matching nfs_client found. This should be impossible, - * because the new nfs_client has already been added to - * nfs_client_list by nfs_get_client(). - * - * Don't BUG(), since the caller is holding a mutex. - */ + /* No match found. The server lost our clientid */ +out: if (prev) nfs_put_client(prev); - spin_unlock(&nn->nfs_client_lock); - pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); - return -NFS4ERR_STALE_CLIENTID; + dprintk("NFS: <-- %s status = %d\n", __func__, status); + return status; } #ifdef CONFIG_NFS_V4_1 @@ -432,7 +424,7 @@ int nfs41_walk_client_list(struct nfs_client *new, { struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); struct nfs_client *pos, *n, *prev = NULL; - int error; + int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { @@ -448,14 +440,17 @@ int nfs41_walk_client_list(struct nfs_client *new, nfs_put_client(prev); prev = pos; - error = nfs_wait_client_init_complete(pos); - if (error < 0) { + nfs4_schedule_lease_recovery(pos); + status = nfs_wait_client_init_complete(pos); + if (status < 0) { nfs_put_client(pos); spin_lock(&nn->nfs_client_lock); continue; } - + status = pos->cl_cons_state; spin_lock(&nn->nfs_client_lock); + if (status < 0) + continue; } if (pos->rpc_ops != new->rpc_ops) @@ -473,6 +468,7 @@ int nfs41_walk_client_list(struct nfs_client *new, if (!nfs4_match_serverowners(pos, new)) continue; + atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); @@ -481,16 +477,10 @@ int nfs41_walk_client_list(struct nfs_client *new, return 0; } - /* - * No matching nfs_client found. This should be impossible, - * because the new nfs_client has already been added to - * nfs_client_list by nfs_get_client(). - * - * Don't BUG(), since the caller is holding a mutex. - */ + /* No matching nfs_client found. */ spin_unlock(&nn->nfs_client_lock); - pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); - return -NFS4ERR_STALE_CLIENTID; + dprintk("NFS: <-- %s status = %d\n", __func__, status); + return status; } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5d864fb36578..eae83bf96c6d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -896,6 +896,8 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) return 0; if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) return 0; + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + return 0; nfs_mark_delegation_referenced(delegation); return 1; } @@ -973,6 +975,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat spin_lock(&deleg_cur->lock); if (nfsi->delegation != deleg_cur || + test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) || (deleg_cur->type & fmode) != fmode) goto no_delegation_unlock; @@ -1352,19 +1355,18 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: + set_bit(NFS_DELEGATED_STATE, &state->flags); nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); + err = -EAGAIN; goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + set_bit(NFS_DELEGATED_STATE, &state->flags); case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was lost */ nfs4_schedule_lease_recovery(server->nfs_client); + err = -EAGAIN; goto out; - case -ERESTARTSYS: - /* - * The show must go on: exit, but mark the - * stateid as needing recovery. - */ case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: @@ -1375,6 +1377,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state err = 0; goto out; } + set_bit(NFS_DELEGATED_STATE, &state->flags); err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); out: @@ -1463,7 +1466,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) struct nfs4_state_owner *sp = data->owner; if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) - return; + goto out_wait; /* * Check if we still need to send an OPEN call, or if we can use * a delegation instead. @@ -1498,6 +1501,7 @@ unlock_no_action: rcu_read_unlock(); out_no_action: task->tk_action = NULL; +out_wait: nfs4_sequence_done(task, &data->o_res.seq_res); } @@ -1626,7 +1630,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) static int nfs4_opendata_access(struct rpc_cred *cred, struct nfs4_opendata *opendata, - struct nfs4_state *state, fmode_t fmode) + struct nfs4_state *state, fmode_t fmode, + int openflags) { struct nfs_access_entry cache; u32 mask; @@ -1638,11 +1643,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred, mask = 0; /* don't check MAY_WRITE - a newly created file may not have - * write mode bits, but POSIX allows the creating process to write */ - if (fmode & FMODE_READ) - mask |= MAY_READ; - if (fmode & FMODE_EXEC) - mask |= MAY_EXEC; + * write mode bits, but POSIX allows the creating process to write. + * use openflags to check for exec, because fmode won't + * always have FMODE_EXEC set when file open for exec. */ + if (openflags & __FMODE_EXEC) { + /* ONLY check for exec rights */ + mask = MAY_EXEC; + } else if (fmode & FMODE_READ) + mask = MAY_READ; cache.cred = cred; cache.jiffies = jiffies; @@ -1841,6 +1849,43 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct sattr->ia_valid |= ATTR_MTIME; } +static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, + fmode_t fmode, + int flags, + struct nfs4_state **res) +{ + struct nfs4_state_owner *sp = opendata->owner; + struct nfs_server *server = sp->so_server; + struct nfs4_state *state; + unsigned int seq; + int ret; + + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + + ret = _nfs4_proc_open(opendata); + if (ret != 0) + goto out; + + state = nfs4_opendata_to_nfs4_state(opendata); + ret = PTR_ERR(state); + if (IS_ERR(state)) + goto out; + if (server->caps & NFS_CAP_POSIX_LOCK) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + + ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); + if (ret != 0) + goto out; + + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { + nfs4_schedule_stateid_recovery(server, state); + nfs4_wait_clnt_recover(server->nfs_client); + } + *res = state; +out: + return ret; +} + /* * Returns a referenced nfs4_state */ @@ -1885,18 +1930,7 @@ static int _nfs4_do_open(struct inode *dir, if (dentry->d_inode != NULL) opendata->state = nfs4_get_open_state(dentry->d_inode, sp); - status = _nfs4_proc_open(opendata); - if (status != 0) - goto err_opendata_put; - - state = nfs4_opendata_to_nfs4_state(opendata); - status = PTR_ERR(state); - if (IS_ERR(state)) - goto err_opendata_put; - if (server->caps & NFS_CAP_POSIX_LOCK) - set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); - - status = nfs4_opendata_access(cred, opendata, state, fmode); + status = _nfs4_open_and_get_state(opendata, fmode, flags, &state); if (status != 0) goto err_opendata_put; @@ -2084,7 +2118,7 @@ static void nfs4_free_closedata(void *data) nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); - nfs_sb_deactive_async(sb); + nfs_sb_deactive(sb); kfree(calldata); } @@ -2146,7 +2180,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) - return; + goto out_wait; task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; calldata->arg.fmode = FMODE_READ|FMODE_WRITE; @@ -2168,16 +2202,14 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (!call_close) { /* Note: exit _without_ calling nfs4_close_done */ - task->tk_action = NULL; - nfs4_sequence_done(task, &calldata->res.seq_res); - goto out; + goto out_no_action; } if (calldata->arg.fmode == 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; if (calldata->roc && pnfs_roc_drain(inode, &calldata->roc_barrier, task)) - goto out; + goto out_wait; } nfs_fattr_init(calldata->res.fattr); @@ -2187,8 +2219,12 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); -out: dprintk("%s: done!\n", __func__); + return; +out_no_action: + task->tk_action = NULL; +out_wait: + nfs4_sequence_done(task, &calldata->res.seq_res); } static const struct rpc_call_ops nfs4_close_ops = { @@ -4419,12 +4455,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) struct nfs4_unlockdata *calldata = data; if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) - return; + goto out_wait; if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ - task->tk_action = NULL; - nfs4_sequence_done(task, &calldata->res.seq_res); - return; + goto out_no_action; } calldata->timestamp = jiffies; if (nfs4_setup_sequence(calldata->server, @@ -4432,6 +4466,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); + return; +out_no_action: + task->tk_action = NULL; +out_wait: + nfs4_sequence_done(task, &calldata->res.seq_res); } static const struct rpc_call_ops nfs4_locku_ops = { @@ -4478,7 +4517,9 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs_inode *nfsi = NFS_I(state->inode); + struct inode *inode = state->inode; + struct nfs4_state_owner *sp = state->owner; + struct nfs_inode *nfsi = NFS_I(inode); struct nfs_seqid *seqid; struct nfs4_lock_state *lsp; struct rpc_task *task; @@ -4488,12 +4529,17 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = nfs4_set_lock_state(state, request); /* Unlock _before_ we do the RPC call */ request->fl_flags |= FL_EXISTS; + /* Exclude nfs_delegation_claim_locks() */ + mutex_lock(&sp->so_delegreturn_mutex); + /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ down_read(&nfsi->rwsem); if (do_vfs_lock(request->fl_file, request) == -ENOENT) { up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); goto out; } up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); if (status != 0) goto out; /* Is this a delegated lock? */ @@ -4572,7 +4618,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) - return; + goto out_wait; /* Do we need to do an open_to_lock_owner? */ if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { @@ -4592,6 +4638,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) nfs_release_seqid(data->arg.open_seqid); out_release_lock_seqid: nfs_release_seqid(data->arg.lock_seqid); +out_wait: + nfs4_sequence_done(task, &data->res.seq_res); dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); } @@ -4809,8 +4857,10 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { + struct nfs4_state_owner *sp = state->owner; struct nfs_inode *nfsi = NFS_I(state->inode); unsigned char fl_flags = request->fl_flags; + unsigned int seq; int status = -ENOLCK; if ((fl_flags & FL_POSIX) && @@ -4832,9 +4882,16 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock status = do_vfs_lock(request->fl_file, request); goto out_unlock; } + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + up_read(&nfsi->rwsem); status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); if (status != 0) + goto out; + down_read(&nfsi->rwsem); + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { + status = -NFS4ERR_DELAY; goto out_unlock; + } /* Note: we always want to sleep here! */ request->fl_flags = fl_flags | FL_SLEEP; if (do_vfs_lock(request->fl_file, request) < 0) @@ -4941,24 +4998,22 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case 0: case -ESTALE: goto out; - case -NFS4ERR_EXPIRED: - nfs4_schedule_stateid_recovery(server, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + set_bit(NFS_DELEGATED_STATE, &state->flags); + case -NFS4ERR_EXPIRED: nfs4_schedule_lease_recovery(server->nfs_client); + err = -EAGAIN; goto out; case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: + set_bit(NFS_DELEGATED_STATE, &state->flags); nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); + err = -EAGAIN; goto out; - case -ERESTARTSYS: - /* - * The show must go on: exit, but mark the - * stateid as needing recovery. - */ case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: @@ -4971,9 +5026,8 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) /* kill_proc(fl->fl_pid, SIGLOST, 1); */ err = 0; goto out; - case -NFS4ERR_DELAY: - break; } + set_bit(NFS_DELEGATED_STATE, &state->flags); err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); out: @@ -6130,7 +6184,8 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) status = nfs4_wait_for_completion_rpc_task(task); if (status == 0) status = task->tk_status; - if (status == 0) + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ + if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9448c579d41a..6ace365c6334 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -136,16 +136,11 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, clp->cl_confirm = clid.confirm; status = nfs40_walk_client_list(clp, result, cred); - switch (status) { - case -NFS4ERR_STALE_CLIENTID: - set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - case 0: + if (status == 0) { /* Sustain the lease, even if it's empty. If the clientid4 * goes stale it's of no use for trunking discovery. */ nfs4_schedule_state_renewal(*result); - break; } - out: return status; } @@ -523,6 +518,8 @@ nfs4_alloc_state_owner(struct nfs_server *server, nfs4_init_seqid_counter(&sp->so_seqid); atomic_set(&sp->so_count, 1); INIT_LIST_HEAD(&sp->so_lru); + seqcount_init(&sp->so_reclaim_seqcount); + mutex_init(&sp->so_delegreturn_mutex); return sp; } @@ -1395,8 +1392,9 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs * recovering after a network partition or a reboot from a * server that doesn't support a grace period. */ -restart: spin_lock(&sp->so_lock); + write_seqcount_begin(&sp->so_reclaim_seqcount); +restart: list_for_each_entry(state, &sp->so_states, open_states) { if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) continue; @@ -1417,6 +1415,7 @@ restart: } spin_unlock(&state->state_lock); nfs4_put_open_state(state); + spin_lock(&sp->so_lock); goto restart; } } @@ -1454,12 +1453,17 @@ restart: goto out_err; } nfs4_put_open_state(state); + spin_lock(&sp->so_lock); goto restart; } + write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); return 0; out_err: nfs4_put_open_state(state); + spin_lock(&sp->so_lock); + write_seqcount_end(&sp->so_reclaim_seqcount); + spin_unlock(&sp->so_lock); return status; } @@ -1863,6 +1867,7 @@ again: case -ETIMEDOUT: case -EAGAIN: ssleep(1); + case -NFS4ERR_STALE_CLIENTID: dprintk("NFS: %s after status %d, retrying\n", __func__, status); goto again; @@ -2022,8 +2027,18 @@ static int nfs4_reset_session(struct nfs_client *clp) nfs4_begin_drain_session(clp); cred = nfs4_get_exchange_id_cred(clp); status = nfs4_proc_destroy_session(clp->cl_session, cred); - if (status && status != -NFS4ERR_BADSESSION && - status != -NFS4ERR_DEADSESSION) { + switch (status) { + case 0: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + break; + case -NFS4ERR_BACK_CHAN_BUSY: + case -NFS4ERR_DELAY: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + status = 0; + ssleep(1); + goto out; + default: status = nfs4_recovery_handle_error(clp, status); goto out; } diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index c6f990656f89..88f9611a945c 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -647,6 +647,7 @@ static struct pnfs_layoutdriver_type objlayout_type = { .flags = PNFS_LAYOUTRET_ON_SETATTR | PNFS_LAYOUTRET_ON_ERROR, + .owner = THIS_MODULE, .alloc_layout_hdr = objlayout_alloc_layout_hdr, .free_layout_hdr = objlayout_free_layout_hdr, diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index e7165d915362..6be70f622b62 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -254,7 +254,7 @@ static void pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) { lo->plh_retry_timestamp = jiffies; - if (test_and_set_bit(fail_bit, &lo->plh_flags)) + if (!test_and_set_bit(fail_bit, &lo->plh_flags)) atomic_inc(&lo->plh_refcount); } @@ -505,37 +505,147 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); -/* - * Called by the state manger to remove all layouts established under an - * expired lease. - */ -void -pnfs_destroy_all_layouts(struct nfs_client *clp) +static bool +pnfs_layout_add_bulk_destroy_list(struct inode *inode, + struct list_head *layout_list) { - struct nfs_server *server; struct pnfs_layout_hdr *lo; - LIST_HEAD(tmp_list); + bool ret = false; - nfs4_deviceid_mark_client_invalid(clp); - nfs4_deviceid_purge_client(clp); + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) { + pnfs_get_layout_hdr(lo); + list_add(&lo->plh_bulk_destroy, layout_list); + ret = true; + } + spin_unlock(&inode->i_lock); + return ret; +} + +/* Caller must hold rcu_read_lock and clp->cl_lock */ +static int +pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, + struct nfs_server *server, + struct list_head *layout_list) +{ + struct pnfs_layout_hdr *lo, *next; + struct inode *inode; + + list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { + inode = igrab(lo->plh_inode); + if (inode == NULL) + continue; + list_del_init(&lo->plh_layouts); + if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) + continue; + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + spin_lock(&clp->cl_lock); + rcu_read_lock(); + return -EAGAIN; + } + return 0; +} + +static int +pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, + bool is_bulk_recall) +{ + struct pnfs_layout_hdr *lo; + struct inode *inode; + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(lseg_list); + int ret = 0; + + while (!list_empty(layout_list)) { + lo = list_entry(layout_list->next, struct pnfs_layout_hdr, + plh_bulk_destroy); + dprintk("%s freeing layout for inode %lu\n", __func__, + lo->plh_inode->i_ino); + inode = lo->plh_inode; + spin_lock(&inode->i_lock); + list_del_init(&lo->plh_bulk_destroy); + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ + if (is_bulk_recall) + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) + ret = -EAGAIN; + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&lseg_list); + pnfs_put_layout_hdr(lo); + iput(inode); + } + return ret; +} + +int +pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); spin_lock(&clp->cl_lock); rcu_read_lock(); +restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - if (!list_empty(&server->layouts)) - list_splice_init(&server->layouts, &tmp_list); + if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0) + continue; + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; } rcu_read_unlock(); spin_unlock(&clp->cl_lock); - while (!list_empty(&tmp_list)) { - lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, - plh_layouts); - dprintk("%s freeing layout for inode %lu\n", __func__, - lo->plh_inode->i_ino); - list_del_init(&lo->plh_layouts); - pnfs_destroy_layout(NFS_I(lo->plh_inode)); + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +int +pnfs_destroy_layouts_byclid(struct nfs_client *clp, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); + + spin_lock(&clp->cl_lock); + rcu_read_lock(); +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +/* + * Called by the state manger to remove all layouts established under an + * expired lease. + */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ + nfs4_deviceid_mark_client_invalid(clp); + nfs4_deviceid_purge_client(clp); + + pnfs_destroy_layouts_byclid(clp, false); } /* @@ -888,7 +998,7 @@ alloc_init_layout_hdr(struct inode *ino, atomic_set(&lo->plh_refcount, 1); INIT_LIST_HEAD(&lo->plh_layouts); INIT_LIST_HEAD(&lo->plh_segs); - INIT_LIST_HEAD(&lo->plh_bulk_recall); + INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); return lo; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index dbf7bba52da0..97cb358bb882 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -132,7 +132,7 @@ struct pnfs_layoutdriver_type { struct pnfs_layout_hdr { atomic_t plh_refcount; struct list_head plh_layouts; /* other client layouts */ - struct list_head plh_bulk_recall; /* clnt list of bulk recalls */ + struct list_head plh_bulk_destroy; struct list_head plh_segs; /* layout segments list */ nfs4_stateid plh_stateid; atomic_t plh_outstanding; /* number of RPCs out */ @@ -196,6 +196,11 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); +int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, + bool is_recall); +int pnfs_destroy_layouts_byclid(struct nfs_client *clp, + bool is_recall); void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index b6bdb18e892c..a5e5d9899d56 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -91,12 +91,16 @@ void nfs_readdata_release(struct nfs_read_data *rdata) put_nfs_open_context(rdata->args.context); if (rdata->pages.pagevec != rdata->pages.page_array) kfree(rdata->pages.pagevec); - if (rdata != &read_header->rpc_data) - kfree(rdata); - else + if (rdata == &read_header->rpc_data) { rdata->header = NULL; + rdata = NULL; + } if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); + /* Note: we only free the rpc_task after callbacks are done. + * See the comment in rpc_free_task() for why + */ + kfree(rdata); } EXPORT_SYMBOL_GPL(nfs_readdata_release); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c25cadf8f8c4..befbae0cce41 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -54,7 +54,6 @@ #include <linux/parser.h> #include <linux/nsproxy.h> #include <linux/rcupdate.h> -#include <linux/kthread.h> #include <asm/uaccess.h> @@ -418,54 +417,6 @@ void nfs_sb_deactive(struct super_block *sb) } EXPORT_SYMBOL_GPL(nfs_sb_deactive); -static int nfs_deactivate_super_async_work(void *ptr) -{ - struct super_block *sb = ptr; - - deactivate_super(sb); - module_put_and_exit(0); - return 0; -} - -/* - * same effect as deactivate_super, but will do final unmount in kthread - * context - */ -static void nfs_deactivate_super_async(struct super_block *sb) -{ - struct task_struct *task; - char buf[INET6_ADDRSTRLEN + 1]; - struct nfs_server *server = NFS_SB(sb); - struct nfs_client *clp = server->nfs_client; - - if (!atomic_add_unless(&sb->s_active, -1, 1)) { - rcu_read_lock(); - snprintf(buf, sizeof(buf), - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - rcu_read_unlock(); - - __module_get(THIS_MODULE); - task = kthread_run(nfs_deactivate_super_async_work, sb, - "%s-deactivate-super", buf); - if (IS_ERR(task)) { - pr_err("%s: kthread_run: %ld\n", - __func__, PTR_ERR(task)); - /* make synchronous call and hope for the best */ - deactivate_super(sb); - module_put(THIS_MODULE); - } - } -} - -void nfs_sb_deactive_async(struct super_block *sb) -{ - struct nfs_server *server = NFS_SB(sb); - - if (atomic_dec_and_test(&server->active)) - nfs_deactivate_super_async(sb); -} -EXPORT_SYMBOL_GPL(nfs_sb_deactive_async); - /* * Deliver file system statistics to userspace */ @@ -1152,7 +1103,7 @@ static int nfs_get_option_str(substring_t args[], char **option) { kfree(*option); *option = match_strdup(args); - return !option; + return !*option; } static int nfs_get_option_ul(substring_t args[], unsigned long *option) @@ -2589,27 +2540,23 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags, struct nfs_server *server; struct dentry *mntroot = ERR_PTR(-ENOMEM); struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod; - int error; - dprintk("--> nfs_xdev_mount_common()\n"); + dprintk("--> nfs_xdev_mount()\n"); mount_info.mntfh = mount_info.cloned->fh; /* create a new volume representation */ server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); - if (IS_ERR(server)) { - error = PTR_ERR(server); - goto out_err; - } - mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, nfs_mod); - dprintk("<-- nfs_xdev_mount_common() = 0\n"); -out: - return mntroot; + if (IS_ERR(server)) + mntroot = ERR_CAST(server); + else + mntroot = nfs_fs_mount_common(server, flags, + dev_name, &mount_info, nfs_mod); -out_err: - dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error); - goto out; + dprintk("<-- nfs_xdev_mount() = %ld\n", + IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L); + return mntroot; } #if IS_ENABLED(CONFIG_NFS_V4) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 3f79c77153b8..d26a32f5b53b 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -95,7 +95,7 @@ static void nfs_async_unlink_release(void *calldata) nfs_dec_sillycount(data->dir); nfs_free_unlinkdata(data); - nfs_sb_deactive_async(sb); + nfs_sb_deactive(sb); } static void nfs_unlink_prepare(struct rpc_task *task, void *calldata) @@ -268,8 +268,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) * point dentry is definitely not a root, so we won't need * that anymore. */ - if (devname_garbage) - kfree(devname_garbage); + kfree(devname_garbage); return 0; out_unlock: spin_unlock(&dentry->d_lock); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b673be31590e..c483cc50b82e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -126,12 +126,16 @@ void nfs_writedata_release(struct nfs_write_data *wdata) put_nfs_open_context(wdata->args.context); if (wdata->pages.pagevec != wdata->pages.page_array) kfree(wdata->pages.pagevec); - if (wdata != &write_header->rpc_data) - kfree(wdata); - else + if (wdata == &write_header->rpc_data) { wdata->header = NULL; + wdata = NULL; + } if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); + /* Note: we only free the rpc_task after callbacks are done. + * See the comment in rpc_free_task() for why + */ + kfree(wdata); } EXPORT_SYMBOL_GPL(nfs_writedata_release); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index fdb180769485..f3859354e41a 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -664,8 +664,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, if (ret < 0) printk(KERN_ERR "NILFS: GC failed during preparation: " "cannot read source blocks: err=%d\n", ret); - else + else { + if (nilfs_sb_need_update(nilfs)) + set_nilfs_discontinued(nilfs); ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + } nilfs_remove_all_gcinodes(nilfs); clear_nilfs_gc_running(nilfs); diff --git a/fs/proc/array.c b/fs/proc/array.c index 6a91e6ffbcbd..f7ed9ee46eb9 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -449,7 +449,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, do { min_flt += t->min_flt; maj_flt += t->maj_flt; - gtime += t->gtime; + gtime += task_gtime(t); t = next_thread(t); } while (t != task); @@ -472,7 +472,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt = task->min_flt; maj_flt = task->maj_flt; task_cputime_adjusted(task, &utime, &stime); - gtime = task->gtime; + gtime = task_gtime(task); } /* scale priority and nice values from timeslices to -20..20 */ diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index fe72cd073dea..3131a03d7d37 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -177,20 +177,6 @@ const struct file_operations proc_net_operations = { .readdir = proc_tgid_net_readdir, }; - -struct proc_dir_entry *proc_net_fops_create(struct net *net, - const char *name, umode_t mode, const struct file_operations *fops) -{ - return proc_create(name, mode, net->proc_net, fops); -} -EXPORT_SYMBOL_GPL(proc_net_fops_create); - -void proc_net_remove(struct net *net, const char *name) -{ - remove_proc_entry(name, net->proc_net); -} -EXPORT_SYMBOL_GPL(proc_net_remove); - static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index f883e7e74305..288f068740f6 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -167,12 +167,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) { char *hdr; - struct timeval timestamp; + struct timespec timestamp; size_t len; - do_gettimeofday(×tamp); + /* Report zeroed timestamp if called before timekeeping has resumed. */ + if (__getnstimeofday(×tamp)) { + timestamp.tv_sec = 0; + timestamp.tv_nsec = 0; + } hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", - (long)timestamp.tv_sec, (long)timestamp.tv_usec); + (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000)); WARN_ON_ONCE(!hdr); len = hdr ? strlen(hdr) : 0; persistent_ram_write(prz, hdr, len); @@ -291,9 +295,8 @@ static void ramoops_free_przs(struct ramoops_context *cxt) kfree(cxt->przs); } -static int __devinit ramoops_init_przs(struct device *dev, - struct ramoops_context *cxt, - phys_addr_t *paddr, size_t dump_mem_sz) +static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, + phys_addr_t *paddr, size_t dump_mem_sz) { int err = -ENOMEM; int i; @@ -336,10 +339,9 @@ fail_prz: return err; } -static int __devinit ramoops_init_prz(struct device *dev, - struct ramoops_context *cxt, - struct persistent_ram_zone **prz, - phys_addr_t *paddr, size_t sz, u32 sig) +static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, + struct persistent_ram_zone **prz, + phys_addr_t *paddr, size_t sz, u32 sig) { if (!sz) return 0; @@ -367,7 +369,7 @@ static int __devinit ramoops_init_prz(struct device *dev, return 0; } -static int __devinit ramoops_probe(struct platform_device *pdev) +static int ramoops_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct ramoops_platform_data *pdata = pdev->dev.platform_data; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index eecd2a8a84dd..0306303be372 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -390,8 +390,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, return 0; } -static int __devinit persistent_ram_post_init(struct persistent_ram_zone *prz, - u32 sig, int ecc_size) +static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, + int ecc_size) { int ret; @@ -443,9 +443,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz) kfree(prz); } -struct persistent_ram_zone * __devinit persistent_ram_new(phys_addr_t start, - size_t size, u32 sig, - int ecc_size) +struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, + u32 sig, int ecc_size) { struct persistent_ram_zone *prz; int ret = -ENOMEM; diff --git a/fs/select.c b/fs/select.c index 2ef72d965036..8c1c96c27062 100644 --- a/fs/select.c +++ b/fs/select.c @@ -26,6 +26,7 @@ #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> +#include <linux/sched/rt.h> #include <asm/uaccess.h> diff --git a/fs/seq_file.c b/fs/seq_file.c index 9d863fb501f9..f2bc3dfd0b88 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -296,7 +296,7 @@ EXPORT_SYMBOL(seq_read); * seq_lseek - ->llseek() method for sequential files. * @file: the file in question * @offset: new position - * @origin: 0 for absolute, 1 for relative position + * @whence: 0 for absolute, 1 for relative position * * Ready-made ->f_op->llseek() */ diff --git a/fs/splice.c b/fs/splice.c index 8890604e3fcd..6909d89d0da5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -696,8 +696,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, return -EINVAL; more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; - if (sd->len < sd->total_len) + + if (sd->len < sd->total_len && pipe->nrbufs > 1) more |= MSG_SENDPAGE_NOTLAST; + return file->f_op->sendpage(file, buf->page, buf->offset, sd->len, &pos, more); } diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 2df555c66d57..aec3d5c98c94 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -205,6 +205,48 @@ void sysfs_unmerge_group(struct kobject *kobj, } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); +/** + * sysfs_add_link_to_group - add a symlink to an attribute group. + * @kobj: The kobject containing the group. + * @group_name: The name of the group. + * @target: The target kobject of the symlink to create. + * @link_name: The name of the symlink to create. + */ +int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, + struct kobject *target, const char *link_name) +{ + struct sysfs_dirent *dir_sd; + int error = 0; + + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); + if (!dir_sd) + return -ENOENT; + + error = sysfs_create_link_sd(dir_sd, target, link_name); + sysfs_put(dir_sd); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); + +/** + * sysfs_remove_link_from_group - remove a symlink from an attribute group. + * @kobj: The kobject containing the group. + * @group_name: The name of the group. + * @link_name: The name of the symlink to remove. + */ +void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, + const char *link_name) +{ + struct sysfs_dirent *dir_sd; + + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); + if (dir_sd) { + sysfs_hash_and_remove(dir_sd, NULL, link_name); + sysfs_put(dir_sd); + } +} +EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); EXPORT_SYMBOL_GPL(sysfs_create_group); EXPORT_SYMBOL_GPL(sysfs_update_group); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 3c9eb5624f5e..8c940df97a52 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -21,26 +21,17 @@ #include "sysfs.h" -static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, - const char *name, int warn) +static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, + struct kobject *target, + const char *name, int warn) { - struct sysfs_dirent *parent_sd = NULL; struct sysfs_dirent *target_sd = NULL; struct sysfs_dirent *sd = NULL; struct sysfs_addrm_cxt acxt; enum kobj_ns_type ns_type; int error; - BUG_ON(!name); - - if (!kobj) - parent_sd = &sysfs_root; - else - parent_sd = kobj->sd; - - error = -EFAULT; - if (!parent_sd) - goto out_put; + BUG_ON(!name || !parent_sd); /* target->sd can go away beneath us but is protected with * sysfs_assoc_lock. Fetch target_sd from it. @@ -96,6 +87,34 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, } /** + * sysfs_create_link_sd - create symlink to a given object. + * @sd: directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + */ +int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link_sd(sd, target, name, 1); +} + +static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, + const char *name, int warn) +{ + struct sysfs_dirent *parent_sd = NULL; + + if (!kobj) + parent_sd = &sysfs_root; + else + parent_sd = kobj->sd; + + if (!parent_sd) + return -EFAULT; + + return sysfs_do_create_link_sd(parent_sd, target, name, warn); +} + +/** * sysfs_create_link - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d73c0932bbd6..d1e4043eb0c3 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -240,3 +240,5 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd); * symlink.c */ extern const struct inode_operations sysfs_symlink_inode_operations; +int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, + const char *name); diff --git a/fs/udf/super.c b/fs/udf/super.c index d44fb568abe1..e9be396a558d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -307,7 +307,8 @@ static void udf_sb_free_partitions(struct super_block *sb) { struct udf_sb_info *sbi = UDF_SB(sb); int i; - + if (sbi->s_partmaps == NULL) + return; for (i = 0; i < sbi->s_partitions; i++) udf_free_partition(&sbi->s_partmaps[i]); kfree(sbi->s_partmaps); diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 393055fe3aef..0ad23253e8b1 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1925,8 +1925,6 @@ xfs_alloc_fix_freelist( targs.mp = mp; targs.agbp = agbp; targs.agno = args->agno; - targs.mod = targs.minleft = targs.wasdel = targs.userdata = - targs.minalignslop = 0; targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4111a40ebe1a..5f707e537171 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -86,11 +86,11 @@ xfs_destroy_ioend( } if (ioend->io_iocb) { + inode_dio_done(ioend->io_inode); if (ioend->io_isasync) { aio_complete(ioend->io_iocb, ioend->io_error ? ioend->io_error : ioend->io_result, 0); } - inode_dio_done(ioend->io_inode); } mempool_free(ioend, xfs_ioend_pool); diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index aaf472532b3c..888683844d98 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -300,9 +300,12 @@ xfs_attr_set_int( if (rsvd) args.trans->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(args.trans, args.total, - XFS_ATTRSET_LOG_RES(mp, args.total), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) { + error = xfs_trans_reserve(args.trans, args.total, + XFS_ATTRSETM_LOG_RES(mp) + + XFS_ATTRSETRT_LOG_RES(mp) * args.total, + 0, XFS_TRANS_PERM_LOG_RES, + XFS_ATTRSET_LOG_COUNT); + if (error) { xfs_trans_cancel(args.trans, 0); return(error); } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 0e92d12765d2..b44af9211bd9 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -147,7 +147,10 @@ xfs_bmap_local_to_extents( xfs_fsblock_t *firstblock, /* first block allocated in xaction */ xfs_extlen_t total, /* total blocks needed by transaction */ int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ + int whichfork, /* data or attr fork */ + void (*init_fn)(struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp)); /* * Search the extents list for the inode, for the extent containing bno. @@ -357,7 +360,42 @@ xfs_bmap_add_attrfork_extents( } /* - * Called from xfs_bmap_add_attrfork to handle local format files. + * Block initialisation functions for local to extent format conversion. + * As these get more complex, they will be moved to the relevant files, + * but for now they are too simple to worry about. + */ +STATIC void +xfs_bmap_local_to_extents_init_fn( + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + bp->b_ops = &xfs_bmbt_buf_ops; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); +} + +STATIC void +xfs_symlink_local_to_remote( + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + /* remote symlink blocks are not verifiable until CRCs come along */ + bp->b_ops = NULL; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); +} + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. Each + * different data fork content type needs a different callout to do the + * conversion. Some are basic and only require special block initialisation + * callouts for the data formating, others (directories) are so specialised they + * handle everything themselves. + * + * XXX (dgc): investigate whether directory conversion can use the generic + * formatting callout. It should be possible - it's just a very complex + * formatter. it would also require passing the transaction through to the init + * function. */ STATIC int /* error */ xfs_bmap_add_attrfork_local( @@ -368,25 +406,29 @@ xfs_bmap_add_attrfork_local( int *flags) /* inode logging flags */ { xfs_da_args_t dargs; /* args for dir/attr code */ - int error; /* error return value */ - xfs_mount_t *mp; /* mount structure pointer */ if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) return 0; + if (S_ISDIR(ip->i_d.di_mode)) { - mp = ip->i_mount; memset(&dargs, 0, sizeof(dargs)); dargs.dp = ip; dargs.firstblock = firstblock; dargs.flist = flist; - dargs.total = mp->m_dirblkfsbs; + dargs.total = ip->i_mount->m_dirblkfsbs; dargs.whichfork = XFS_DATA_FORK; dargs.trans = tp; - error = xfs_dir2_sf_to_block(&dargs); - } else - error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, - XFS_DATA_FORK); - return error; + return xfs_dir2_sf_to_block(&dargs); + } + + if (S_ISLNK(ip->i_d.di_mode)) + return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, + flags, XFS_DATA_FORK, + xfs_symlink_local_to_remote); + + return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, + XFS_DATA_FORK, + xfs_bmap_local_to_extents_init_fn); } /* @@ -3099,8 +3141,6 @@ xfs_bmap_extents_to_btree( args.fsbno = *firstblock; } args.minlen = args.maxlen = args.prod = 1; - args.total = args.minleft = args.alignment = args.mod = args.isfl = - args.minalignslop = 0; args.wasdel = wasdel; *logflagsp = 0; if ((error = xfs_alloc_vextent(&args))) { @@ -3221,7 +3261,10 @@ xfs_bmap_local_to_extents( xfs_fsblock_t *firstblock, /* first block allocated in xaction */ xfs_extlen_t total, /* total blocks needed by transaction */ int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + int whichfork, + void (*init_fn)(struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp)) { int error; /* error return value */ int flags; /* logging flags returned */ @@ -3241,12 +3284,12 @@ xfs_bmap_local_to_extents( xfs_buf_t *bp; /* buffer for extent block */ xfs_bmbt_rec_host_t *ep;/* extent record pointer */ + ASSERT((ifp->if_flags & + (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = ip->i_mount; args.firstblock = *firstblock; - ASSERT((ifp->if_flags & - (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); /* * Allocate a block. We know we need only one, since the * file currently fits in an inode. @@ -3259,20 +3302,21 @@ xfs_bmap_local_to_extents( args.type = XFS_ALLOCTYPE_NEAR_BNO; } args.total = total; - args.mod = args.minleft = args.alignment = args.wasdel = - args.isfl = args.minalignslop = 0; args.minlen = args.maxlen = args.prod = 1; - if ((error = xfs_alloc_vextent(&args))) + error = xfs_alloc_vextent(&args); + if (error) goto done; - /* - * Can't fail, the space was reserved. - */ + + /* Can't fail, the space was reserved. */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - bp->b_ops = &xfs_bmbt_buf_ops; - memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + + /* initialise the block and copy the data */ + init_fn(bp, ip, ifp); + + /* account for the change in fork size and log everything */ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); @@ -4680,9 +4724,6 @@ __xfs_bmapi_allocate( return error; } - if (bma->flags & XFS_BMAPI_STACK_SWITCH) - bma->stack_switch = 1; - error = xfs_bmap_alloc(bma); if (error) return error; @@ -4922,8 +4963,32 @@ xfs_bmapi_write( XFS_STATS_INC(xs_blk_mapw); if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + /* + * XXX (dgc): This assumes we are only called for inodes that + * contain content neutral data in local format. Anything that + * contains caller-specific data in local format that needs + * transformation to move to a block format needs to do the + * conversion to extent format itself. + * + * Directory data forks and attribute forks handle this + * themselves, but with the addition of metadata verifiers every + * data fork in local format now contains caller specific data + * and as such conversion through this function is likely to be + * broken. + * + * The only likely user of this branch is for remote symlinks, + * but we cannot overwrite the data fork contents of the symlink + * (EEXIST occurs higher up the stack) and so it will never go + * from local format to extent format here. Hence I don't think + * this branch is ever executed intentionally and we should + * consider removing it and asserting that xfs_bmapi_write() + * cannot be called directly on local format forks. i.e. callers + * are completely responsible for local to extent format + * conversion, not xfs_bmapi_write(). + */ error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, - &bma.logflags, whichfork); + &bma.logflags, whichfork, + xfs_bmap_local_to_extents_init_fn); if (error) goto error0; } @@ -4956,6 +5021,9 @@ xfs_bmapi_write( bma.flist = flist; bma.firstblock = firstblock; + if (flags & XFS_BMAPI_STACK_SWITCH) + bma.stack_switch = 1; + while (bno < end && n < *nmap) { inhole = eof || bma.got.br_startoff > bno; wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 26673a0b20e7..4e8f0df82d02 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -175,7 +175,7 @@ xfs_buf_get_maps( bp->b_map_count = map_count; if (map_count == 1) { - bp->b_maps = &bp->b_map; + bp->b_maps = &bp->__b_map; return 0; } @@ -193,7 +193,7 @@ static void xfs_buf_free_maps( struct xfs_buf *bp) { - if (bp->b_maps != &bp->b_map) { + if (bp->b_maps != &bp->__b_map) { kmem_free(bp->b_maps); bp->b_maps = NULL; } @@ -377,8 +377,8 @@ xfs_buf_allocate_memory( } use_alloc_page: - start = BBTOB(bp->b_map.bm_bn) >> PAGE_SHIFT; - end = (BBTOB(bp->b_map.bm_bn + bp->b_length) + PAGE_SIZE - 1) + start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; page_count = end - start; error = _xfs_buf_get_pages(bp, page_count, flags); @@ -487,6 +487,7 @@ _xfs_buf_find( struct rb_node *parent; xfs_buf_t *bp; xfs_daddr_t blkno = map[0].bm_bn; + xfs_daddr_t eofs; int numblks = 0; int i; @@ -498,6 +499,23 @@ _xfs_buf_find( ASSERT(!(numbytes < (1 << btp->bt_sshift))); ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); + /* + * Corrupted block numbers can get through to here, unfortunately, so we + * have to check that the buffer falls within the filesystem bounds. + */ + eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); + if (blkno >= eofs) { + /* + * XXX (dgc): we should really be returning EFSCORRUPTED here, + * but none of the higher level infrastructure supports + * returning a specific error on buffer lookup failures. + */ + xfs_alert(btp->bt_mount, + "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", + __func__, blkno, eofs); + return NULL; + } + /* get tree root */ pag = xfs_perag_get(btp->bt_mount, xfs_daddr_to_agno(btp->bt_mount, blkno)); @@ -640,7 +658,7 @@ _xfs_buf_read( xfs_buf_flags_t flags) { ASSERT(!(flags & XBF_WRITE)); - ASSERT(bp->b_map.bm_bn != XFS_BUF_DADDR_NULL); + ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); @@ -933,8 +951,6 @@ xfs_buf_trylock( locked = down_trylock(&bp->b_sema) == 0; if (locked) XB_SET_OWNER(bp); - else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) - xfs_log_force(bp->b_target->bt_mount, 0); trace_xfs_buf_trylock(bp, _RET_IP_); return locked; @@ -1487,6 +1503,8 @@ restart: while (!list_empty(&btp->bt_lru)) { bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); if (atomic_read(&bp->b_hold) > 1) { + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + list_move_tail(&bp->b_lru, &btp->bt_lru); spin_unlock(&btp->bt_lru_lock); delay(100); goto restart; @@ -1709,7 +1727,7 @@ xfs_buf_cmp( struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); xfs_daddr_t diff; - diff = ap->b_map.bm_bn - bp->b_map.bm_bn; + diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; if (diff < 0) return -1; if (diff > 0) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 23f5642480bb..433a12ed7b17 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -151,7 +151,7 @@ typedef struct xfs_buf { struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ struct xfs_buf_map *b_maps; /* compound buffer map */ - struct xfs_buf_map b_map; /* inline compound buffer map */ + struct xfs_buf_map __b_map; /* inline compound buffer map */ int b_map_count; int b_io_length; /* IO size in BBs */ atomic_t b_pin_count; /* pin count */ @@ -330,8 +330,8 @@ void xfs_buf_stale(struct xfs_buf *bp); * In future, uncached buffers will pass the block number directly to the io * request function and hence these macros will go away at that point. */ -#define XFS_BUF_ADDR(bp) ((bp)->b_map.bm_bn) -#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_map.bm_bn = (xfs_daddr_t)(bno)) +#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index becf4a97efc6..cf263476d6b4 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -37,109 +37,6 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } - -#ifdef XFS_TRANS_DEBUG -/* - * This function uses an alternate strategy for tracking the bytes - * that the user requests to be logged. This can then be used - * in conjunction with the bli_orig array in the buf log item to - * catch bugs in our callers' code. - * - * We also double check the bits set in xfs_buf_item_log using a - * simple algorithm to check that every byte is accounted for. - */ -STATIC void -xfs_buf_item_log_debug( - xfs_buf_log_item_t *bip, - uint first, - uint last) -{ - uint x; - uint byte; - uint nbytes; - uint chunk_num; - uint word_num; - uint bit_num; - uint bit_set; - uint *wordp; - - ASSERT(bip->bli_logged != NULL); - byte = first; - nbytes = last - first + 1; - bfset(bip->bli_logged, first, nbytes); - for (x = 0; x < nbytes; x++) { - chunk_num = byte >> XFS_BLF_SHIFT; - word_num = chunk_num >> BIT_TO_WORD_SHIFT; - bit_num = chunk_num & (NBWORD - 1); - wordp = &(bip->bli_format.blf_data_map[word_num]); - bit_set = *wordp & (1 << bit_num); - ASSERT(bit_set); - byte++; - } -} - -/* - * This function is called when we flush something into a buffer without - * logging it. This happens for things like inodes which are logged - * separately from the buffer. - */ -void -xfs_buf_item_flush_log_debug( - xfs_buf_t *bp, - uint first, - uint last) -{ - xfs_buf_log_item_t *bip = bp->b_fspriv; - uint nbytes; - - if (bip == NULL || (bip->bli_item.li_type != XFS_LI_BUF)) - return; - - ASSERT(bip->bli_logged != NULL); - nbytes = last - first + 1; - bfset(bip->bli_logged, first, nbytes); -} - -/* - * This function is called to verify that our callers have logged - * all the bytes that they changed. - * - * It does this by comparing the original copy of the buffer stored in - * the buf log item's bli_orig array to the current copy of the buffer - * and ensuring that all bytes which mismatch are set in the bli_logged - * array of the buf log item. - */ -STATIC void -xfs_buf_item_log_check( - xfs_buf_log_item_t *bip) -{ - char *orig; - char *buffer; - int x; - xfs_buf_t *bp; - - ASSERT(bip->bli_orig != NULL); - ASSERT(bip->bli_logged != NULL); - - bp = bip->bli_buf; - ASSERT(bp->b_length > 0); - ASSERT(bp->b_addr != NULL); - orig = bip->bli_orig; - buffer = bp->b_addr; - for (x = 0; x < BBTOB(bp->b_length); x++) { - if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { - xfs_emerg(bp->b_mount, - "%s: bip %x buffer %x orig %x index %d", - __func__, bip, bp, orig, x); - ASSERT(0); - } - } -} -#else -#define xfs_buf_item_log_debug(x,y,z) -#define xfs_buf_item_log_check(x) -#endif - STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); /* @@ -237,7 +134,7 @@ xfs_buf_item_size( * cancel flag in it. */ trace_xfs_buf_item_size_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); return bip->bli_format_count; } @@ -278,7 +175,7 @@ xfs_buf_item_format_segment( uint buffer_offset; /* copy the flags across from the base format item */ - blfp->blf_flags = bip->bli_format.blf_flags; + blfp->blf_flags = bip->__bli_format.blf_flags; /* * Base size is the actual size of the ondisk structure - it reflects @@ -287,6 +184,17 @@ xfs_buf_item_format_segment( */ base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); + + nvecs = 0; + first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { + /* + * If the map is not be dirty in the transaction, mark + * the size as zero and do not advance the vector pointer. + */ + goto out; + } + vecp->i_addr = blfp; vecp->i_len = base_size; vecp->i_type = XLOG_REG_TYPE_BFORMAT; @@ -301,15 +209,13 @@ xfs_buf_item_format_segment( */ trace_xfs_buf_item_format_stale(bip); ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); - blfp->blf_size = nvecs; - return vecp; + goto out; } /* * Fill in an iovec for each set of contiguous chunks. */ - first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); - ASSERT(first_bit != -1); + last_bit = first_bit; nbits = 1; for (;;) { @@ -371,7 +277,8 @@ xfs_buf_item_format_segment( nbits++; } } - bip->bli_format.blf_size = nvecs; +out: + blfp->blf_size = nvecs; return vecp; } @@ -405,7 +312,7 @@ xfs_buf_item_format( if (bip->bli_flags & XFS_BLI_INODE_BUF) { if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) - bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; bip->bli_flags &= ~XFS_BLI_INODE_BUF; } @@ -419,7 +326,6 @@ xfs_buf_item_format( * Check to make sure everything is consistent. */ trace_xfs_buf_item_format(bip); - xfs_buf_item_log_check(bip); } /* @@ -485,7 +391,7 @@ xfs_buf_item_unpin( ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); ASSERT(XFS_BUF_ISSTALE(bp)); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); @@ -563,8 +469,18 @@ xfs_buf_item_push( if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; - if (!xfs_buf_trylock(bp)) + if (!xfs_buf_trylock(bp)) { + /* + * If we have just raced with a buffer being pinned and it has + * been marked stale, we could end up stalling until someone else + * issues a log force to unpin the stale buffer. Check for the + * race condition here so xfsaild recognizes the buffer is pinned + * and queues a log force to move it along. + */ + if (xfs_buf_ispinned(bp)) + return XFS_ITEM_PINNED; return XFS_ITEM_LOCKED; + } ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); @@ -601,7 +517,7 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - int aborted; + int aborted, clean, i; uint hold; /* Clear the buffer's association with this transaction. */ @@ -631,7 +547,7 @@ xfs_buf_item_unlock( */ if (bip->bli_flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { atomic_dec(&bip->bli_refcount); return; @@ -642,12 +558,27 @@ xfs_buf_item_unlock( /* * If the buf item isn't tracking any data, free it, otherwise drop the - * reference we hold to it. + * reference we hold to it. If we are aborting the transaction, this may + * be the only reference to the buf item, so we free it anyway + * regardless of whether it is dirty or not. A dirty abort implies a + * shutdown, anyway. */ - if (xfs_bitmap_empty(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size)) + clean = 1; + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) { + clean = 0; + break; + } + } + if (clean) xfs_buf_item_relse(bp); - else + else if (aborted) { + if (atomic_dec_and_test(&bip->bli_refcount)) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_buf_item_relse(bp); + } + } else atomic_dec(&bip->bli_refcount); if (!hold) @@ -716,7 +647,7 @@ xfs_buf_item_get_format( bip->bli_format_count = count; if (count == 1) { - bip->bli_formats = &bip->bli_format; + bip->bli_formats = &bip->__bli_format; return 0; } @@ -731,7 +662,7 @@ STATIC void xfs_buf_item_free_format( struct xfs_buf_log_item *bip) { - if (bip->bli_formats != &bip->bli_format) { + if (bip->bli_formats != &bip->__bli_format) { kmem_free(bip->bli_formats); bip->bli_formats = NULL; } @@ -898,8 +829,6 @@ xfs_buf_item_log_segment( mask = (1 << end_bit) - 1; *wordp |= mask; } - - xfs_buf_item_log_debug(bip, first, last); } /* diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 6850f49f4af3..ee36c88ecfde 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -98,13 +98,9 @@ typedef struct xfs_buf_log_item { unsigned int bli_flags; /* misc flags */ unsigned int bli_recur; /* lock recursion count */ atomic_t bli_refcount; /* cnt of tp refs */ -#ifdef XFS_TRANS_DEBUG - char *bli_orig; /* original buffer copy */ - char *bli_logged; /* bytes logged (bitmap) */ -#endif int bli_format_count; /* count of headers */ struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ - struct xfs_buf_log_format bli_format; /* embedded in-log header */ + struct xfs_buf_log_format __bli_format; /* embedded in-log header */ } xfs_buf_log_item_t; void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); @@ -117,16 +113,6 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); -#ifdef XFS_TRANS_DEBUG -void -xfs_buf_item_flush_log_debug( - struct xfs_buf *bp, - uint first, - uint last); -#else -#define xfs_buf_item_flush_log_debug(bp, first, last) -#endif - #endif /* __KERNEL__ */ #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index d0e9c74d3d96..a8bd26b82ecb 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -246,10 +246,10 @@ xfs_swap_extents( goto out_unlock; } - error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); + error = -filemap_write_and_wait(VFS_I(tip)->i_mapping); if (error) goto out_unlock; - truncate_pagecache_range(VFS_I(ip), 0, -1); + truncate_pagecache_range(VFS_I(tip), 0, -1); /* Verify O_DIRECT for ftmp */ if (VN_CACHED(VFS_I(tip)) != 0) { diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 7536faaa61e7..12afe07a91d7 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -355,10 +355,12 @@ xfs_dir2_block_addname( /* * If need to compact the leaf entries, do it now. */ - if (compact) + if (compact) { xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog, &lfloghigh, &lfloglow); - else if (btp->stale) { + /* recalculate blp post-compaction */ + blp = xfs_dir2_block_leaf_p(btp); + } else if (btp->stale) { /* * Set leaf logging boundaries to impossible state. * For the no-stale case they're set explicitly. diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9e1bf5294c91..8025eb23ad72 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -612,15 +612,9 @@ xfs_qm_dqread( if (flags & XFS_QMOPT_DQALLOC) { tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_WRITE_LOG_RES(mp) + - /* - * Round the chunklen up to the next multiple - * of 128 (buf log item chunk size)). - */ - BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128, - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); + XFS_QM_DQALLOC_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); if (error) goto error1; cancelflags = XFS_TRANS_RELEASE_LOG_RES; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 94eaeedc5498..2866b8c78b7a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -709,8 +709,8 @@ xfs_fs_log_dummy( int error; tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, + XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); return error; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index a815412eab80..515bf71ce01c 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -279,8 +279,6 @@ xfs_ialloc_ag_alloc( (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; - args.mod = args.total = args.wasdel = args.isfl = - args.userdata = args.minalignslop = 0; args.prod = 1; /* @@ -333,8 +331,6 @@ xfs_ialloc_ag_alloc( * Allocate a fixed-size extent of inodes. */ args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.mod = args.total = args.wasdel = args.isfl = - args.userdata = args.minalignslop = 0; args.prod = 1; /* * Allow space for the inode btree to split. diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 66282dcb821b..4f201656d2d9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2379,9 +2379,6 @@ xfs_iflush_fork( char *cp; xfs_ifork_t *ifp; xfs_mount_t *mp; -#ifdef XFS_TRANS_DEBUG - int first; -#endif static const short brootflag[2] = { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; static const short dataflag[2] = @@ -2724,9 +2721,6 @@ xfs_iflush_int( xfs_inode_log_item_t *iip; xfs_dinode_t *dip; xfs_mount_t *mp; -#ifdef XFS_TRANS_DEBUG - int first; -#endif ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 22baf6ea4fac..237e7f6f2ab3 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -419,6 +419,7 @@ static inline void xfs_iflock(struct xfs_inode *ip) static inline void xfs_ifunlock(struct xfs_inode *ip) { xfs_iflags_clear(ip, XFS_IFLOCK); + smp_mb(); wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d041d47d9d86..f034bd1652f0 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -269,17 +269,6 @@ xfs_inode_item_format( } else { ASSERT(!(iip->ili_fields & XFS_ILOG_DBROOT)); -#ifdef XFS_TRANS_DEBUG - if (iip->ili_root_size > 0) { - ASSERT(iip->ili_root_size == - ip->i_df.if_broot_bytes); - ASSERT(memcmp(iip->ili_orig_root, - ip->i_df.if_broot, - iip->ili_root_size) == 0); - } else { - ASSERT(ip->i_df.if_broot_bytes == 0); - } -#endif iip->ili_fields &= ~XFS_ILOG_DBROOT; } break; @@ -678,11 +667,6 @@ void xfs_inode_item_destroy( xfs_inode_t *ip) { -#ifdef XFS_TRANS_DEBUG - if (ip->i_itemp->ili_root_size != 0) { - kmem_free(ip->i_itemp->ili_orig_root); - } -#endif kmem_zone_free(xfs_ili_zone, ip->i_itemp); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 376d4d0b2635..779812fb3d80 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -148,10 +148,6 @@ typedef struct xfs_inode_log_item { data exts */ struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged attr exts */ -#ifdef XFS_TRANS_DEBUG - int ili_root_size; - char *ili_orig_root; -#endif xfs_inode_log_format_t ili_format; /* logged structure */ } xfs_inode_log_item_t; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index add06b4e9a63..912d83d8860a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -311,6 +311,62 @@ xfs_iomap_eof_want_preallocate( } /* + * Determine the initial size of the preallocation. We are beyond the current + * EOF here, but we need to take into account whether this is a sparse write or + * an extending write when determining the preallocation size. Hence we need to + * look up the extent that ends at the current write offset and use the result + * to determine the preallocation size. + * + * If the extent is a hole, then preallocation is essentially disabled. + * Otherwise we take the size of the preceeding data extent as the basis for the + * preallocation size. If the size of the extent is greater than half the + * maximum extent length, then use the current offset as the basis. This ensures + * that for large files the preallocation size always extends to MAXEXTLEN + * rather than falling short due to things like stripe unit/width alignment of + * real extents. + */ +STATIC int +xfs_iomap_eof_prealloc_initial_size( + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_off_t offset, + xfs_bmbt_irec_t *imap, + int nimaps) +{ + xfs_fileoff_t start_fsb; + int imaps = 1; + int error; + + ASSERT(nimaps >= imaps); + + /* if we are using a specific prealloc size, return now */ + if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + return 0; + + /* + * As we write multiple pages, the offset will always align to the + * start of a page and hence point to a hole at EOF. i.e. if the size is + * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096) + * will return FSB 1. Hence if there are blocks in the file, we want to + * point to the block prior to the EOF block and not the hole that maps + * directly at @offset. + */ + start_fsb = XFS_B_TO_FSB(mp, offset); + if (start_fsb) + start_fsb--; + error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE); + if (error) + return 0; + + ASSERT(imaps == 1); + if (imap[0].br_startblock == HOLESTARTBLOCK) + return 0; + if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) + return imap[0].br_blockcount; + return XFS_B_TO_FSB(mp, offset); +} + +/* * If we don't have a user specified preallocation size, dynamically increase * the preallocation size as the size of the file grows. Cap the maximum size * at a single extent or less if the filesystem is near full. The closer the @@ -319,20 +375,19 @@ xfs_iomap_eof_want_preallocate( STATIC xfs_fsblock_t xfs_iomap_prealloc_size( struct xfs_mount *mp, - struct xfs_inode *ip) + struct xfs_inode *ip, + xfs_off_t offset, + struct xfs_bmbt_irec *imap, + int nimaps) { xfs_fsblock_t alloc_blocks = 0; - if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { + alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, + imap, nimaps); + if (alloc_blocks > 0) { int shift = 0; int64_t freesp; - /* - * rounddown_pow_of_two() returns an undefined result - * if we pass in alloc_blocks = 0. Hence the "+ 1" to - * ensure we always pass in a non-zero value. - */ - alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1; alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, rounddown_pow_of_two(alloc_blocks)); @@ -351,6 +406,15 @@ xfs_iomap_prealloc_size( } if (shift) alloc_blocks >>= shift; + + /* + * If we are still trying to allocate more space than is + * available, squash the prealloc hard. This can happen if we + * have a large file on a small filesystem and the above + * lowspace thresholds are smaller than MAXEXTLEN. + */ + while (alloc_blocks >= freesp) + alloc_blocks >>= 4; } if (alloc_blocks < mp->m_writeio_blocks) @@ -390,7 +454,6 @@ xfs_iomap_write_delay( extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, imap, XFS_WRITE_IMAPS, &prealloc); if (error) @@ -398,7 +461,10 @@ xfs_iomap_write_delay( retry: if (prealloc) { - xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip); + xfs_fsblock_t alloc_blocks; + + alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, + XFS_WRITE_IMAPS); aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 46bd9d52ab51..eec226f78a40 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -120,7 +120,7 @@ xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, int count, - boolean_t syncing); + bool syncing); STATIC void xlog_verify_tail_lsn( struct xlog *log, @@ -1737,7 +1737,7 @@ xlog_sync( ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); - xlog_verify_iclog(log, iclog, count, B_TRUE); + xlog_verify_iclog(log, iclog, count, true); /* account for log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); @@ -3611,7 +3611,7 @@ xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, int count, - boolean_t syncing) + bool syncing) { xlog_op_header_t *ophead; xlog_in_core_t *icptr; @@ -3659,7 +3659,7 @@ xlog_verify_iclog( /* clientid is only 1 byte */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); - if (syncing == B_FALSE || (field_offset & 0x1ff)) { + if (!syncing || (field_offset & 0x1ff)) { clientid = ophead->oh_clientid; } else { idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); @@ -3682,7 +3682,7 @@ xlog_verify_iclog( /* check length */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); - if (syncing == B_FALSE || (field_offset & 0x1ff)) { + if (!syncing || (field_offset & 0x1ff)) { op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((__psint_t)&ophead->oh_len - diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index da508463ff10..3806088a8f77 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -658,7 +658,7 @@ xfs_sb_quiet_read_verify( return; } /* quietly fail */ - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, EWRONGFS); } static void @@ -1109,8 +1109,8 @@ xfs_mount_reset_sbqflags( return 0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); xfs_alert(mp, "%s: Superblock update failed!", __func__); @@ -1583,8 +1583,8 @@ xfs_log_sbcount(xfs_mount_t *mp) return 0; tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, + XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); return error; @@ -1945,8 +1945,8 @@ xfs_mount_log_sb( XFS_SB_VERSIONNUM)); tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); - error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, + XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); return error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bab8314507e4..bc907061d392 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -34,12 +34,19 @@ typedef struct xfs_trans_reservations { uint tr_addafork; /* cvt inode to attributed trans */ uint tr_writeid; /* write setuid/setgid file */ uint tr_attrinval; /* attr fork buffer invalidation */ - uint tr_attrset; /* set/create an attribute */ + uint tr_attrsetm; /* set/create an attribute at mount time */ + uint tr_attrsetrt; /* set/create an attribute at runtime */ uint tr_attrrm; /* remove an attribute */ uint tr_clearagi; /* clear bad agi unlinked ino bucket */ uint tr_growrtalloc; /* grow realtime allocations */ uint tr_growrtzero; /* grow realtime zeroing */ uint tr_growrtfree; /* grow realtime freeing */ + uint tr_qm_sbchange; /* change quota flags */ + uint tr_qm_setqlim; /* adjust quota limits */ + uint tr_qm_dqalloc; /* allocate quota on disk */ + uint tr_qm_quotaoff; /* turn quota off */ + uint tr_qm_equotaoff;/* end of turn quota off */ + uint tr_sb; /* modify superblock */ } xfs_trans_reservations_t; #ifndef __KERNEL__ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 60eff4763156..e5b5cf973781 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1584,10 +1584,9 @@ xfs_qm_write_sb_changes( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - if ((error = xfs_trans_reserve(tp, 0, - mp->m_sb.sb_sectsize + 128, 0, - 0, - XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { xfs_trans_cancel(tp, 0); return error; } diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 6b39115bf145..2d02eac1c9a8 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -146,7 +146,7 @@ xfs_qm_newmount( * inode goes inactive and wants to free blocks, * or via xfs_log_mount_finish. */ - *needquotamount = B_TRUE; + *needquotamount = true; *quotaflags = mp->m_qflags; mp->m_qflags = 0; } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 5f53e75409b8..cf9a34051e07 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -408,10 +408,10 @@ xfs_qm_scall_getqstat( { struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_inode *uip, *gip; - boolean_t tempuqip, tempgqip; + bool tempuqip, tempgqip; uip = gip = NULL; - tempuqip = tempgqip = B_FALSE; + tempuqip = tempgqip = false; memset(out, 0, sizeof(fs_quota_stat_t)); out->qs_version = FS_QSTAT_VERSION; @@ -434,12 +434,12 @@ xfs_qm_scall_getqstat( if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &uip) == 0) - tempuqip = B_TRUE; + tempuqip = true; } if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &gip) == 0) - tempgqip = B_TRUE; + tempgqip = true; } if (uip) { out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; @@ -490,8 +490,9 @@ xfs_qm_scall_setqlim( return 0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, - 0, 0, XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { xfs_trans_cancel(tp, 0); return (error); } @@ -638,8 +639,9 @@ xfs_qm_log_quotaoff_end( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); - if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2, - 0, 0, XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_END_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { xfs_trans_cancel(tp, 0); return (error); } @@ -671,14 +673,10 @@ xfs_qm_log_quotaoff( uint oldsbqflag=0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); - if ((error = xfs_trans_reserve(tp, 0, - sizeof(xfs_qoff_logitem_t) * 2 + - mp->m_sb.sb_sectsize + 128, - 0, - 0, - XFS_DEFAULT_LOG_COUNT))) { + error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) goto error0; - } qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); @@ -784,11 +782,11 @@ xfs_qm_scall_getquota( (XFS_IS_OQUOTA_ENFORCED(mp) && (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && dst->d_id != 0) { - if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) && + if ((dst->d_bcount > dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { ASSERT(dst->d_btimer != 0); } - if (((int) dst->d_icount > (int) dst->d_ino_softlimit) && + if ((dst->d_icount > dst->d_ino_softlimit) && (dst->d_ino_softlimit > 0)) { ASSERT(dst->d_itimer != 0); } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ab8839b26272..c407121873b4 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -139,9 +139,9 @@ static const match_table_t tokens = { STATIC unsigned long -suffix_strtoul(char *s, char **endp, unsigned int base) +suffix_kstrtoint(char *s, unsigned int base, int *res) { - int last, shift_left_factor = 0; + int last, shift_left_factor = 0, _res; char *value = s; last = strlen(value) - 1; @@ -158,7 +158,10 @@ suffix_strtoul(char *s, char **endp, unsigned int base) value[last] = '\0'; } - return simple_strtoul((const char *)s, endp, base) << shift_left_factor; + if (kstrtoint(s, base, &_res)) + return -EINVAL; + *res = _res << shift_left_factor; + return 0; } /* @@ -174,7 +177,7 @@ xfs_parseargs( char *options) { struct super_block *sb = mp->m_super; - char *this_char, *value, *eov; + char *this_char, *value; int dsunit = 0; int dswidth = 0; int iosize = 0; @@ -230,14 +233,16 @@ xfs_parseargs( this_char); return EINVAL; } - mp->m_logbufs = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &mp->m_logbufs)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } - mp->m_logbsize = suffix_strtoul(value, &eov, 10); + if (suffix_kstrtoint(value, 10, &mp->m_logbsize)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", @@ -266,7 +271,8 @@ xfs_parseargs( this_char); return EINVAL; } - iosize = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &iosize)) + return EINVAL; iosizelog = ffs(iosize) - 1; } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { if (!value || !*value) { @@ -274,7 +280,8 @@ xfs_parseargs( this_char); return EINVAL; } - iosize = suffix_strtoul(value, &eov, 10); + if (suffix_kstrtoint(value, 10, &iosize)) + return EINVAL; iosizelog = ffs(iosize) - 1; } else if (!strcmp(this_char, MNTOPT_GRPID) || !strcmp(this_char, MNTOPT_BSDGROUPS)) { @@ -296,14 +303,16 @@ xfs_parseargs( this_char); return EINVAL; } - dsunit = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &dsunit)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { if (!value || !*value) { xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } - dswidth = simple_strtoul(value, &eov, 10); + if (kstrtoint(value, 10, &dswidth)) + return EINVAL; } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { mp->m_flags |= XFS_MOUNT_SMALL_INUMS; } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2e137d4a85ae..16a812977eab 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -341,6 +341,7 @@ DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); +DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_io); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 06ed520a767f..2fd7c1ff1d21 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -37,14 +37,45 @@ #include "xfs_extent_busy.h" #include "xfs_bmap.h" #include "xfs_quota.h" +#include "xfs_qm.h" #include "xfs_trans_priv.h" #include "xfs_trans_space.h" #include "xfs_inode_item.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" #include "xfs_trace.h" kmem_zone_t *xfs_trans_zone; kmem_zone_t *xfs_log_item_desc_zone; +/* + * A buffer has a format structure overhead in the log in addition + * to the data, so we need to take this into account when reserving + * space in a transaction for a buffer. Round the space required up + * to a multiple of 128 bytes so that we don't change the historical + * reservation that has been used for this overhead. + */ +STATIC uint +xfs_buf_log_overhead(void) +{ + return round_up(sizeof(struct xlog_op_header) + + sizeof(struct xfs_buf_log_format), 128); +} + +/* + * Calculate out transaction log reservation per item in bytes. + * + * The nbufs argument is used to indicate the number of items that + * will be changed in a transaction. size is used to tell how many + * bytes should be reserved per item. + */ +STATIC uint +xfs_calc_buf_res( + uint nbufs, + uint size) +{ + return nbufs * (size + xfs_buf_log_overhead()); +} /* * Various log reservation values. @@ -85,18 +116,15 @@ xfs_calc_write_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + - XFS_ALLOCFREE_LOG_COUNT(mp, 2))), - (2 * mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); + MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -117,18 +145,17 @@ xfs_calc_itruncate_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + - 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), - (4 * mp->m_sb.sb_sectsize + - 4 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 4) + - 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) + - 128 * 5 + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); + MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(5, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + + mp->m_in_maxlevels, 0))); } /* @@ -148,14 +175,12 @@ xfs_calc_rename_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((4 * mp->m_sb.sb_inodesize + - 2 * XFS_DIROP_LOG_RES(mp) + - 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))), - (3 * mp->m_sb.sb_sectsize + - 3 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 3) + - 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))); + MAX((xfs_calc_buf_res(4, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -175,15 +200,12 @@ xfs_calc_link_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + - XFS_DIROP_LOG_RES(mp) + - 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), - (mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); + MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -203,15 +225,12 @@ xfs_calc_remove_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + - XFS_DIROP_LOG_RES(mp) + - 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), - (2 * mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); + MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -233,18 +252,18 @@ xfs_calc_symlink_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, 1) + - XFS_DIROP_LOG_RES(mp) + - 1024 + - 128 * (4 + XFS_DIROP_LOG_COUNT(mp))), - (2 * mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + - XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); + MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(1, 1024)), + (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_in_maxlevels, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -267,18 +286,19 @@ xfs_calc_create_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - mp->m_sb.sb_inodesize + + MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + (uint)XFS_FSB_TO_B(mp, 1) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, 1) + - XFS_DIROP_LOG_RES(mp) + - 128 * (3 + XFS_DIROP_LOG_COUNT(mp))), - (3 * mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + - XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); + xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_in_maxlevels, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -306,16 +326,16 @@ xfs_calc_ifree_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) + - 128 * 5 + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)); + xfs_calc_buf_res(1, 0) + + xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + + mp->m_in_maxlevels, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); } /* @@ -343,9 +363,9 @@ STATIC uint xfs_calc_growdata_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_sectsize * 3 + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); + return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); } /* @@ -362,12 +382,12 @@ STATIC uint xfs_calc_growrtalloc_reservation( struct xfs_mount *mp) { - return 2 * mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + - mp->m_sb.sb_inodesize + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)); + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); } /* @@ -379,7 +399,7 @@ STATIC uint xfs_calc_growrtzero_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_blocksize + 128; + return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize); } /* @@ -396,11 +416,10 @@ STATIC uint xfs_calc_growrtfree_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_inodesize + - mp->m_sb.sb_blocksize + - mp->m_rsumsize + - 128 * 5; + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(1, mp->m_rsumsize); } /* @@ -411,7 +430,7 @@ STATIC uint xfs_calc_swrite_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_inodesize + 128; + return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize); } /* @@ -421,7 +440,7 @@ xfs_calc_swrite_reservation( STATIC uint xfs_calc_writeid_reservation(xfs_mount_t *mp) { - return mp->m_sb.sb_inodesize + 128; + return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize); } /* @@ -437,13 +456,13 @@ xfs_calc_addafork_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize * 2 + - mp->m_dirblksize + - XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + - XFS_ALLOCFREE_LOG_RES(mp, 1) + - 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 + - XFS_ALLOCFREE_LOG_COUNT(mp, 1)); + xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, mp->m_dirblksize) + + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); } /* @@ -461,35 +480,51 @@ STATIC uint xfs_calc_attrinval_reservation( struct xfs_mount *mp) { - return MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + - 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))), - (4 * mp->m_sb.sb_sectsize + - 4 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 4) + - 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))); + return MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)))); } /* - * Setting an attribute. + * Setting an attribute at mount time. * the inode getting the attribute * the superblock for allocations * the agfs extents are allocated from * the attribute btree * max depth * the inode allocation btree * Since attribute transaction space is dependent on the size of the attribute, - * the calculation is done partially at mount time and partially at runtime. + * the calculation is done partially at mount time and partially at runtime(see + * below). */ STATIC uint -xfs_calc_attrset_reservation( +xfs_calc_attrsetm_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - mp->m_sb.sb_inodesize + - mp->m_sb.sb_sectsize + - XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + - 128 * (2 + XFS_DA_NODE_MAXDEPTH); + xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); +} + +/* + * Setting an attribute at runtime, transaction space unit per block. + * the superblock for allocations: sector size + * the inode bmap btree could join or split: max depth * block size + * Since the runtime attribute transaction space is dependent on the total + * blocks needed for the 1st bmap, here we calculate out the space unit for + * one block so that the caller could figure out the total space according + * to the attibute extent length in blocks by: ext * XFS_ATTRSETRT_LOG_RES(mp). + */ +STATIC uint +xfs_calc_attrsetrt_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1)); } /* @@ -508,16 +543,15 @@ xfs_calc_attrrm_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((mp->m_sb.sb_inodesize + - XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + - XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + - 128 * (1 + XFS_DA_NODE_MAXDEPTH + - XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), - (2 * mp->m_sb.sb_sectsize + - 2 * mp->m_sb.sb_sectsize + - mp->m_sb.sb_sectsize + - XFS_ALLOCFREE_LOG_RES(mp, 2) + - 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); + MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, + XFS_FSB_TO_B(mp, 1)) + + (uint)XFS_FSB_TO_B(mp, + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); } /* @@ -527,7 +561,78 @@ STATIC uint xfs_calc_clear_agi_bucket_reservation( struct xfs_mount *mp) { - return mp->m_sb.sb_sectsize + 128; + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Clearing the quotaflags in the superblock. + * the super block for changing quota flags: sector size + */ +STATIC uint +xfs_calc_qm_sbchange_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Adjusting quota limits. + * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) + */ +STATIC uint +xfs_calc_qm_setqlim_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); +} + +/* + * Allocating quota on disk if needed. + * the write transaction log space: XFS_WRITE_LOG_RES(mp) + * the unit of quota allocation: one system block size + */ +STATIC uint +xfs_calc_qm_dqalloc_reservation( + struct xfs_mount *mp) +{ + return XFS_WRITE_LOG_RES(mp) + + xfs_calc_buf_res(1, + XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); +} + +/* + * Turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the superblock for the quota flags: sector size + */ +STATIC uint +xfs_calc_qm_quotaoff_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2 + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * End of turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + */ +STATIC uint +xfs_calc_qm_quotaoff_end_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2; +} + +/* + * Syncing the incore super block changes to disk. + * the super block to reflect the changes: sector size + */ +STATIC uint +xfs_calc_sb_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); } /* @@ -555,12 +660,19 @@ xfs_trans_init( resp->tr_writeid = xfs_calc_writeid_reservation(mp); resp->tr_addafork = xfs_calc_addafork_reservation(mp); resp->tr_attrinval = xfs_calc_attrinval_reservation(mp); - resp->tr_attrset = xfs_calc_attrset_reservation(mp); + resp->tr_attrsetm = xfs_calc_attrsetm_reservation(mp); + resp->tr_attrsetrt = xfs_calc_attrsetrt_reservation(mp); resp->tr_attrrm = xfs_calc_attrrm_reservation(mp); resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp); resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp); resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp); resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp); + resp->tr_qm_sbchange = xfs_calc_qm_sbchange_reservation(mp); + resp->tr_qm_setqlim = xfs_calc_qm_setqlim_reservation(mp); + resp->tr_qm_dqalloc = xfs_calc_qm_dqalloc_reservation(mp); + resp->tr_qm_quotaoff = xfs_calc_qm_quotaoff_reservation(mp); + resp->tr_qm_equotaoff = xfs_calc_qm_quotaoff_end_reservation(mp); + resp->tr_sb = xfs_calc_sb_reservation(mp); } /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c6c0601abd7a..cd29f6171021 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -252,17 +252,19 @@ struct xfs_log_item_desc { * as long as SWRITE logs the entire inode core */ #define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) -#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) +#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) #define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) #define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) -#define XFS_ATTRSET_LOG_RES(mp, ext) \ - ((mp)->m_reservations.tr_attrset + \ - (ext * (mp)->m_sb.sb_sectsize) + \ - (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ - (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) -#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) +#define XFS_ATTRSETM_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetm) +#define XFS_ATTRSETRT_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetrt) +#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) #define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) - +#define XFS_QM_SBCHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_qm_sbchange) +#define XFS_QM_SETQLIM_LOG_RES(mp) ((mp)->m_reservations.tr_qm_setqlim) +#define XFS_QM_DQALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_qm_dqalloc) +#define XFS_QM_QUOTAOFF_LOG_RES(mp) ((mp)->m_reservations.tr_qm_quotaoff) +#define XFS_QM_QUOTAOFF_END_LOG_RES(mp) ((mp)->m_reservations.tr_qm_equotaoff) +#define XFS_SB_LOG_RES(mp) ((mp)->m_reservations.tr_sb) /* * Various log count values. diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 6011ee661339..0eda7254305f 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -55,20 +55,6 @@ xfs_ail_check( ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); -#ifdef XFS_TRANS_DEBUG - /* - * Walk the list checking lsn ordering, and that every entry has the - * XFS_LI_IN_AIL flag set. This is really expensive, so only do it - * when specifically debugging the transaction subsystem. - */ - prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); - list_for_each_entry(lip, &ailp->xa_ail, li_ail) { - if (&prev_lip->li_ail != &ailp->xa_ail) - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); - ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); - prev_lip = lip; - } -#endif /* XFS_TRANS_DEBUG */ } #else /* !DEBUG */ #define xfs_ail_check(a,l) diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 4fc17d479d42..3edf5dbee001 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -93,7 +93,7 @@ _xfs_trans_bjoin( xfs_buf_item_init(bp, tp->t_mountp); bip = bp->b_fspriv; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); if (reset_recur) bip->bli_recur = 0; @@ -432,7 +432,7 @@ xfs_trans_brelse(xfs_trans_t *tp, bip = bp->b_fspriv; ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_brelse(bip); @@ -519,7 +519,7 @@ xfs_trans_bhold(xfs_trans_t *tp, ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_HOLD; @@ -539,7 +539,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp, ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); @@ -598,7 +598,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, bip->bli_flags &= ~XFS_BLI_STALE; ASSERT(XFS_BUF_ISSTALE(bp)); XFS_BUF_UNSTALE(bp); - bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; + bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } tp->t_flags |= XFS_TRANS_DIRTY; @@ -643,6 +643,7 @@ xfs_trans_binval( xfs_buf_t *bp) { xfs_buf_log_item_t *bip = bp->b_fspriv; + int i; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); @@ -657,8 +658,8 @@ xfs_trans_binval( */ ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); - ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; @@ -668,10 +669,12 @@ xfs_trans_binval( bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); - bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; - bip->bli_format.blf_flags |= XFS_BLF_CANCEL; - memset((char *)(bip->bli_format.blf_data_map), 0, - (bip->bli_format.blf_map_size * sizeof(uint))); + bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; + bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; + for (i = 0; i < bip->bli_format_count; i++) { + memset(bip->bli_formats[i].blf_data_map, 0, + (bip->bli_formats[i].blf_map_size * sizeof(uint))); + } bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; tp->t_flags |= XFS_TRANS_DIRTY; } @@ -775,5 +778,5 @@ xfs_trans_dquot_buf( type == XFS_BLF_GDQUOT_BUF); ASSERT(atomic_read(&bip->bli_refcount) > 0); - bip->bli_format.blf_flags |= type; + bip->__bli_format.blf_flags |= type; } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 0c7fa54f309e..642c2d6e1db1 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -516,7 +516,7 @@ xfs_trans_unreserve_and_mod_dquots( int i, j; xfs_dquot_t *dqp; xfs_dqtrx_t *qtrx, *qa; - boolean_t locked; + bool locked; if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) return; @@ -537,17 +537,17 @@ xfs_trans_unreserve_and_mod_dquots( * about the number of blocks used field, or deltas. * Also we don't bother to zero the fields. */ - locked = B_FALSE; + locked = false; if (qtrx->qt_blk_res) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; dqp->q_res_bcount -= (xfs_qcnt_t)qtrx->qt_blk_res; } if (qtrx->qt_ino_res) { if (!locked) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; } dqp->q_res_icount -= (xfs_qcnt_t)qtrx->qt_ino_res; @@ -556,7 +556,7 @@ xfs_trans_unreserve_and_mod_dquots( if (qtrx->qt_rtblk_res) { if (!locked) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; } dqp->q_res_rtbcount -= (xfs_qcnt_t)qtrx->qt_rtblk_res; diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index d2eee20d5f5b..ac6d567704db 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -33,14 +33,6 @@ #include "xfs_inode_item.h" #include "xfs_trace.h" -#ifdef XFS_TRANS_DEBUG -STATIC void -xfs_trans_inode_broot_debug( - xfs_inode_t *ip); -#else -#define xfs_trans_inode_broot_debug(ip) -#endif - /* * Add a locked inode to the transaction. * @@ -67,8 +59,6 @@ xfs_trans_ijoin( * Get a log_item_desc to point at the new item. */ xfs_trans_add_item(tp, &iip->ili_item); - - xfs_trans_inode_broot_debug(ip); } /* @@ -135,34 +125,3 @@ xfs_trans_log_inode( flags |= ip->i_itemp->ili_last_fields; ip->i_itemp->ili_fields |= flags; } - -#ifdef XFS_TRANS_DEBUG -/* - * Keep track of the state of the inode btree root to make sure we - * log it properly. - */ -STATIC void -xfs_trans_inode_broot_debug( - xfs_inode_t *ip) -{ - xfs_inode_log_item_t *iip; - - ASSERT(ip->i_itemp != NULL); - iip = ip->i_itemp; - if (iip->ili_root_size != 0) { - ASSERT(iip->ili_orig_root != NULL); - kmem_free(iip->ili_orig_root); - iip->ili_root_size = 0; - iip->ili_orig_root = NULL; - } - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - ASSERT((ip->i_df.if_broot != NULL) && - (ip->i_df.if_broot_bytes > 0)); - iip->ili_root_size = ip->i_df.if_broot_bytes; - iip->ili_orig_root = - (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP); - memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot), - iip->ili_root_size); - } -} -#endif diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 7a41874f4c20..61ba1cfa974c 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -32,7 +32,6 @@ typedef unsigned int __uint32_t; typedef signed long long int __int64_t; typedef unsigned long long int __uint64_t; -typedef enum { B_FALSE,B_TRUE } boolean_t; typedef __uint32_t prid_t; /* project ID */ typedef __uint32_t inst_t; /* an instruction */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index d95f565a390e..77ad74834baa 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -725,7 +725,7 @@ xfs_create( int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - boolean_t unlock_dp_on_error = B_FALSE; + bool unlock_dp_on_error = false; uint cancel_flags; int committed; prid_t prid; @@ -794,7 +794,7 @@ xfs_create( } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - unlock_dp_on_error = B_TRUE; + unlock_dp_on_error = true; xfs_bmap_init(&free_list, &first_block); @@ -830,7 +830,7 @@ xfs_create( * error path. */ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = B_FALSE; + unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, &first_block, &free_list, resblks ? @@ -1367,7 +1367,7 @@ xfs_symlink( int pathlen; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - boolean_t unlock_dp_on_error = B_FALSE; + bool unlock_dp_on_error = false; uint cancel_flags; int committed; xfs_fileoff_t first_fsb; @@ -1438,7 +1438,7 @@ xfs_symlink( } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - unlock_dp_on_error = B_TRUE; + unlock_dp_on_error = true; /* * Check whether the directory allows new symlinks or not. @@ -1484,7 +1484,7 @@ xfs_symlink( * error path. */ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = B_FALSE; + unlock_dp_on_error = false; /* * Also attach the dquot(s) to it, if applicable. |