diff options
Diffstat (limited to 'fs')
348 files changed, 9555 insertions, 7554 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 332b5ff02fec..f7003cfac63d 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -76,7 +76,7 @@ static const match_table_t tokens = { * Return 0 upon success, -ERRNO upon failure. */ -static int v9fs_parse_options(struct v9fs_session_info *v9ses) +static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) { char *options; substring_t args[MAX_OPT_ARGS]; @@ -90,10 +90,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses) v9ses->debug = 0; v9ses->cache = 0; - if (!v9ses->options) + if (!opts) return 0; - options = kstrdup(v9ses->options, GFP_KERNEL); + options = kstrdup(opts, GFP_KERNEL); if (!options) { P9_DPRINTK(P9_DEBUG_ERROR, "failed to allocate copy of option string\n"); @@ -206,24 +206,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->uid = ~0; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; - if (data) { - v9ses->options = kstrdup(data, GFP_KERNEL); - if (!v9ses->options) { - P9_DPRINTK(P9_DEBUG_ERROR, - "failed to allocate copy of option string\n"); - retval = -ENOMEM; - goto error; - } - } - rc = v9fs_parse_options(v9ses); + rc = v9fs_parse_options(v9ses, data); if (rc < 0) { retval = rc; goto error; } - v9ses->clnt = p9_client_create(dev_name, v9ses->options); - + v9ses->clnt = p9_client_create(dev_name, data); if (IS_ERR(v9ses->clnt)) { retval = PTR_ERR(v9ses->clnt); v9ses->clnt = NULL; @@ -280,7 +270,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) __putname(v9ses->uname); __putname(v9ses->aname); - kfree(v9ses->options); } /** diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index a7d567192998..38762bf102a9 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -85,7 +85,6 @@ struct v9fs_session_info { unsigned int afid; unsigned int cache; - char *options; /* copy of mount options */ char *uname; /* user name to mount as */ char *aname; /* name of remote hierarchy being mounted */ unsigned int maxdata; /* max data for client interface */ diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 6fcb1e7095cf..92828281a30b 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -57,7 +57,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page) buffer = kmap(page); offset = page_offset(page); - retval = v9fs_file_readn(filp, buffer, NULL, offset, PAGE_CACHE_SIZE); + retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset); if (retval < 0) goto done; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 81f8bbf12f9f..06a223d50a81 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -171,7 +171,6 @@ int v9fs_uflags2omode(int uflags, int extended) /** * v9fs_blank_wstat - helper function to setup a 9P stat structure - * @v9ses: 9P session info (for determining extended mode) * @wstat: structure to initialize * */ @@ -207,65 +206,72 @@ v9fs_blank_wstat(struct p9_wstat *wstat) struct inode *v9fs_get_inode(struct super_block *sb, int mode) { + int err; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); inode = new_inode(sb); - if (inode) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_blocks = 0; - inode->i_rdev = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->a_ops = &v9fs_addr_operations; - - switch (mode & S_IFMT) { - case S_IFIFO: - case S_IFBLK: - case S_IFCHR: - case S_IFSOCK: - if (!v9fs_extended(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, - "special files without extended mode\n"); - return ERR_PTR(-EINVAL); - } - init_special_inode(inode, inode->i_mode, - inode->i_rdev); - break; - case S_IFREG: - inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; - break; - case S_IFLNK: - if (!v9fs_extended(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, - "extended modes used w/o 9P2000.u\n"); - return ERR_PTR(-EINVAL); - } - inode->i_op = &v9fs_symlink_inode_operations; - break; - case S_IFDIR: - inc_nlink(inode); - if (v9fs_extended(v9ses)) - inode->i_op = &v9fs_dir_inode_operations_ext; - else - inode->i_op = &v9fs_dir_inode_operations; - inode->i_fop = &v9fs_dir_operations; - break; - default: - P9_DPRINTK(P9_DEBUG_ERROR, - "BAD mode 0x%x S_IFMT 0x%x\n", - mode, mode & S_IFMT); - return ERR_PTR(-EINVAL); - } - } else { + if (!inode) { P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); return ERR_PTR(-ENOMEM); } + + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_blocks = 0; + inode->i_rdev = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mapping->a_ops = &v9fs_addr_operations; + + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + case S_IFSOCK: + if (!v9fs_extended(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, + "special files without extended mode\n"); + err = -EINVAL; + goto error; + } + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + case S_IFREG: + inode->i_op = &v9fs_file_inode_operations; + inode->i_fop = &v9fs_file_operations; + break; + case S_IFLNK: + if (!v9fs_extended(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, + "extended modes used w/o 9P2000.u\n"); + err = -EINVAL; + goto error; + } + inode->i_op = &v9fs_symlink_inode_operations; + break; + case S_IFDIR: + inc_nlink(inode); + if (v9fs_extended(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_ext; + else + inode->i_op = &v9fs_dir_inode_operations; + inode->i_fop = &v9fs_dir_operations; + break; + default: + P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", + mode, mode & S_IFMT); + err = -EINVAL; + goto error; + } + return inode; + +error: + iput(inode); + return ERR_PTR(err); } /* @@ -338,30 +344,25 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, ret = NULL; st = p9_client_stat(fid); - if (IS_ERR(st)) { - err = PTR_ERR(st); - st = NULL; - goto error; - } + if (IS_ERR(st)) + return ERR_CAST(st); umode = p9mode2unixmode(v9ses, st->mode); ret = v9fs_get_inode(sb, umode); if (IS_ERR(ret)) { err = PTR_ERR(ret); - ret = NULL; goto error; } v9fs_stat2inode(st, ret, sb); ret->i_ino = v9fs_qid2ino(&st->qid); + p9stat_free(st); kfree(st); return ret; error: + p9stat_free(st); kfree(st); - if (ret) - iput(ret); - return ERR_PTR(err); } @@ -403,9 +404,9 @@ v9fs_open_created(struct inode *inode, struct file *file) * @v9ses: session information * @dir: directory that dentry is being created in * @dentry: dentry that is being created + * @extension: 9p2000.u extension string to support devices, etc. * @perm: create permissions * @mode: open mode - * @extension: 9p2000.u extension string to support devices, etc. * */ static struct p9_fid * @@ -470,7 +471,10 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, dentry->d_op = &v9fs_dentry_operations; d_instantiate(dentry, inode); - v9fs_fid_add(dentry, fid); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + return ofid; error: diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 38d695d66a0b..8961f1a8f668 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -81,7 +81,7 @@ static int v9fs_set_super(struct super_block *s, void *data) static void v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, - int flags) + int flags, void *data) { sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize_bits = fls(v9ses->maxdata - 1); @@ -91,6 +91,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | MS_NOATIME; + + save_mount_options(sb, data); } /** @@ -113,14 +115,11 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, struct v9fs_session_info *v9ses = NULL; struct p9_wstat *st = NULL; int mode = S_IRWXUGO | S_ISVTX; - uid_t uid = current_fsuid(); - gid_t gid = current_fsgid(); struct p9_fid *fid; int retval = 0; P9_DPRINTK(P9_DEBUG_VFS, " \n"); - st = NULL; v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) return -ENOMEM; @@ -142,7 +141,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, retval = PTR_ERR(sb); goto free_stat; } - v9fs_fill_super(sb, v9ses, flags); + v9fs_fill_super(sb, v9ses, flags, data); inode = v9fs_get_inode(sb, S_IFDIR | mode); if (IS_ERR(inode)) { @@ -150,9 +149,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, goto release_sb; } - inode->i_uid = uid; - inode->i_gid = gid; - root = d_alloc_root(inode); if (!root) { iput(inode); @@ -173,10 +169,8 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); simple_set_mnt(mnt, sb); return 0; -release_sb: - deactivate_locked_super(sb); - free_stat: + p9stat_free(st); kfree(st); clunk_fid: @@ -185,7 +179,12 @@ clunk_fid: close_session: v9fs_session_close(v9ses); kfree(v9ses); + return retval; +release_sb: + p9stat_free(st); + kfree(st); + deactivate_locked_super(sb); return retval; } @@ -207,24 +206,10 @@ static void v9fs_kill_super(struct super_block *s) v9fs_session_close(v9ses); kfree(v9ses); + s->s_fs_info = NULL; P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); } -/** - * v9fs_show_options - Show mount options in /proc/mounts - * @m: seq_file to write to - * @mnt: mount descriptor - * - */ - -static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt) -{ - struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info; - - seq_printf(m, "%s", v9ses->options); - return 0; -} - static void v9fs_umount_begin(struct super_block *sb) { @@ -237,7 +222,7 @@ v9fs_umount_begin(struct super_block *sb) static const struct super_operations v9fs_super_ops = { .statfs = simple_statfs, .clear_inode = v9fs_clear_inode, - .show_options = v9fs_show_options, + .show_options = generic_show_options, .umount_begin = v9fs_umount_begin, }; diff --git a/fs/Kconfig b/fs/Kconfig index a97263be6a91..455aa207e67e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -43,6 +43,7 @@ source "fs/xfs/Kconfig" source "fs/gfs2/Kconfig" source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" +source "fs/nilfs2/Kconfig" endif # BLOCK @@ -187,32 +188,6 @@ source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" -config NILFS2_FS - tristate "NILFS2 file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL - select CRC32 - help - NILFS2 is a log-structured file system (LFS) supporting continuous - snapshotting. In addition to versioning capability of the entire - file system, users can even restore files mistakenly overwritten or - destroyed just a few seconds ago. Since this file system can keep - consistency like conventional LFS, it achieves quick recovery after - system crashes. - - NILFS2 creates a number of checkpoints every few seconds or per - synchronous write basis (unless there is no change). Users can - select significant versions among continuously created checkpoints, - and can change them into snapshots which will be preserved for long - periods until they are changed back to checkpoints. Each - snapshot is mountable as a read-only file system concurrently with - its writable mount, and this feature is convenient for online backup. - - Some features including atime, extended attributes, and POSIX ACLs, - are not supported yet. - - To compile this file system support as a module, choose M here: the - module will be called nilfs2. If unsure, say N. - endif # MISC_FILESYSTEMS menuconfig NETWORK_FILESYSTEMS diff --git a/fs/adfs/super.c b/fs/adfs/super.c index aad92f0a1048..6910a98bd73c 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -13,6 +13,7 @@ #include <linux/parser.h> #include <linux/mount.h> #include <linux/seq_file.h> +#include <linux/smp_lock.h> #include <linux/statfs.h> #include "adfs.h" #include "dir_f.h" diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 9bd757774c9e..88067f36e5e7 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -564,7 +564,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) { struct afs_vnode *vnode, *dir; - struct afs_fid fid; + struct afs_fid uninitialized_var(fid); struct dentry *parent; struct key *key; void *dir_version; diff --git a/fs/afs/file.c b/fs/afs/file.c index 0149dab365e7..681c2a7b013f 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -134,9 +134,16 @@ static int afs_readpage(struct file *file, struct page *page) inode = page->mapping->host; - ASSERT(file != NULL); - key = file->private_data; - ASSERT(key != NULL); + if (file) { + key = file->private_data; + ASSERT(key != NULL); + } else { + key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error_nokey; + } + } _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); @@ -207,12 +214,17 @@ static int afs_readpage(struct file *file, struct page *page) unlock_page(page); } + if (!file) + key_put(key); _leave(" = 0"); return 0; error: SetPageError(page); unlock_page(page); + if (!file) + key_put(key); +error_nokey: _leave(" = %d", ret); return ret; } diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index c52be53f6946..5ffb570cd3a8 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -17,7 +17,6 @@ #include <linux/pagemap.h> #include <linux/mount.h> #include <linux/namei.h> -#include <linux/mnt_namespace.h> #include "internal.h" diff --git a/fs/afs/super.c b/fs/afs/super.c index ad0514d0115f..e1ea1c240b6a 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/smp_lock.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/parser.h> diff --git a/fs/afs/write.c b/fs/afs/write.c index c2e7a7ff0080..c63a3c8beb73 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode) .bdi = mapping->backing_dev_info, .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, - .for_writepages = 1, .range_cyclic = 1, }; int ret; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index f3da2eb51f56..00bf8fcb245f 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -19,7 +19,6 @@ #include <linux/sched.h> #include <linux/compat.h> #include <linux/syscalls.h> -#include <linux/smp_lock.h> #include <linux/magic.h> #include <linux/dcache.h> #include <linux/uaccess.h> diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index aa39ae83f019..3da18d453488 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -77,7 +77,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) } /* Update the expiry counter if fs is busy */ - if (!may_umount_tree(mnt)) { + if (!may_umount_tree(path.mnt)) { struct autofs_info *ino = autofs4_dentry_ino(top); ino->last_used = jiffies; goto done; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 54bd07d44e68..1e41aadb1068 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -8,7 +8,6 @@ #include <linux/time.h> #include <linux/string.h> #include <linux/fs.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/sched.h> #include "bfs.h" diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 6a021265f018..88b9a3ff44e4 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -11,7 +11,6 @@ #include <linux/fs.h> #include <linux/buffer_head.h> -#include <linux/smp_lock.h> #include "bfs.h" #undef DEBUG diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b7c1603cd4bd..7c1e65d54872 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, } } - /* - * Now fill out the bss section. First pad the last page up - * to the page boundary, and then perform a mmap to make sure - * that there are zero-mapped pages up to and including the - * last bss page. - */ - if (padzero(elf_bss)) { - error = -EFAULT; - goto out_close; - } + if (last_bss > elf_bss) { + /* + * Now fill out the bss section. First pad the last page up + * to the page boundary, and then perform a mmap to make sure + * that there are zero-mapped pages up to and including the + * last bss page. + */ + if (padzero(elf_bss)) { + error = -EFAULT; + goto out_close; + } - /* What we have mapped so far */ - elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); + /* What we have mapped so far */ + elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); - /* Map the last of the bss segment */ - if (last_bss > elf_bss) { + /* Map the last of the bss segment */ down_write(¤t->mm->mmap_sem); error = do_brk(elf_bss, last_bss - elf_bss); up_write(¤t->mm->mmap_sem); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 697f6b5f1313..e92f229e3c6e 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -828,15 +828,22 @@ static int load_flat_shared_library(int id, struct lib_info *libs) if (IS_ERR(bprm.file)) return res; + bprm.cred = prepare_exec_creds(); + res = -ENOMEM; + if (!bprm.cred) + goto out; + res = prepare_binprm(&bprm); if (res <= (unsigned long)-4096) res = load_flat_file(&bprm, libs, id, NULL); - if (bprm.file) { - allow_write_access(bprm.file); - fput(bprm.file); - bprm.file = NULL; - } + + abort_creds(bprm.cred); + +out: + allow_write_access(bprm.file); + fput(bprm.file); + return(res); } @@ -705,14 +705,13 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, } static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, - struct sg_iovec *iov, int iov_count, int uncopy, - int do_free_page) + struct sg_iovec *iov, int iov_count, + int to_user, int from_user, int do_free_page) { int ret = 0, i; struct bio_vec *bvec; int iov_idx = 0; unsigned int iov_off = 0; - int read = bio_data_dir(bio) == READ; __bio_for_each_segment(bvec, bio, i, 0) { char *bv_addr = page_address(bvec->bv_page); @@ -727,13 +726,14 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, iov_addr = iov[iov_idx].iov_base + iov_off; if (!ret) { - if (!read && !uncopy) - ret = copy_from_user(bv_addr, iov_addr, - bytes); - if (read && uncopy) + if (to_user) ret = copy_to_user(iov_addr, bv_addr, bytes); + if (from_user) + ret = copy_from_user(bv_addr, iov_addr, + bytes); + if (ret) ret = -EFAULT; } @@ -770,7 +770,8 @@ int bio_uncopy_user(struct bio *bio) if (!bio_flagged(bio, BIO_NULL_MAPPED)) ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, - bmd->nr_sgvecs, 1, bmd->is_our_pages); + bmd->nr_sgvecs, bio_data_dir(bio) == READ, + 0, bmd->is_our_pages); bio_free_map_data(bmd); bio_put(bio); return ret; @@ -875,8 +876,9 @@ struct bio *bio_copy_user_iov(struct request_queue *q, /* * success */ - if (!write_to_vm && (!map_data || !map_data->null_mapped)) { - ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); + if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || + (map_data && map_data->from_user)) { + ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0); if (ret) goto cleanup; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 3a6d4fb2a329..71e7e03ac343 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -420,7 +420,6 @@ static void bdev_destroy_inode(struct inode *inode) { struct bdev_inode *bdi = BDEV_I(inode); - bdi->bdev.bd_inode_backing_dev_info = NULL; kmem_cache_free(bdev_cachep, bdi); } @@ -564,6 +563,16 @@ struct block_device *bdget(dev_t dev) EXPORT_SYMBOL(bdget); +/** + * bdgrab -- Grab a reference to an already referenced block device + * @bdev: Block device to grab a reference to. + */ +struct block_device *bdgrab(struct block_device *bdev) +{ + atomic_inc(&bdev->bd_inode->i_count); + return bdev; +} + long nr_blockdev_pages(void) { struct block_device *bdev; @@ -1395,6 +1404,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) } /* + * Write data to the block device. Only intended for the block device itself + * and the raw driver which basically is a fake block device. + * + * Does not take i_mutex for the write and thus is not for general purpose + * use. + */ +ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + BUG_ON(iocb->ki_pos != pos); + + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + if (ret > 0 || ret == -EIOCBQUEUED) { + ssize_t err; + + err = generic_write_sync(file, pos, ret); + if (err < 0 && ret > 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_aio_write); + +/* * Try to release a page associated with block device when the system * is under memory pressure. */ @@ -1426,7 +1462,7 @@ const struct file_operations def_blk_fops = { .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write_nolock, + .aio_write = blkdev_aio_write, .mmap = generic_file_mmap, .fsync = block_fsync, .unlocked_ioctl = block_ioctl, diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 6e4f6c50a120..019e8af449ab 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work) * list */ if (worker->idle) { - spin_lock_irqsave(&worker->workers->lock, flags); + spin_lock(&worker->workers->lock); worker->idle = 0; list_move_tail(&worker->worker_list, &worker->workers->worker_list); - spin_unlock_irqrestore(&worker->workers->lock, flags); + spin_unlock(&worker->workers->lock); } if (!worker->working) { wake = 1; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index de1e2fd32080..9d8ba4d54a37 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -26,7 +26,6 @@ #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> -#include <linux/smp_lock.h> #include <linux/backing-dev.h> #include <linux/mpage.h> #include <linux/swap.h> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 60a45f3a4e91..3fdcc0512d3a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) btrfs_disk_key_to_cpu(&k1, disk); - if (k1.objectid > k2->objectid) - return 1; - if (k1.objectid < k2->objectid) - return -1; - if (k1.type > k2->type) - return 1; - if (k1.type < k2->type) - return -1; - if (k1.offset > k2->offset) - return 1; - if (k1.offset < k2->offset) - return -1; - return 0; + return btrfs_comp_cpu_keys(&k1, k2); } /* @@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (btrfs_header_nritems(mid) > 2) - return 0; - if (btrfs_header_nritems(mid) < 2) err_on_enospc = 1; @@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root struct extent_buffer *b; int slot; int ret; + int err; int level; int lowest_unlock = 1; u8 lowest_level = 0; @@ -1737,8 +1723,6 @@ again: p->locks[level] = 1; if (cow) { - int wret; - /* * if we don't really need to cow this block * then we don't want to set the path blocking, @@ -1749,12 +1733,12 @@ again: btrfs_set_path_blocking(p); - wret = btrfs_cow_block(trans, root, b, - p->nodes[level + 1], - p->slots[level + 1], &b); - if (wret) { + err = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b); + if (err) { free_extent_buffer(b); - ret = wret; + ret = err; goto done; } } @@ -1793,41 +1777,45 @@ cow_done: ret = bin_search(b, key, level, &slot); if (level != 0) { - if (ret && slot > 0) + int dec = 0; + if (ret && slot > 0) { + dec = 1; slot -= 1; + } p->slots[level] = slot; - ret = setup_nodes_for_search(trans, root, p, b, level, + err = setup_nodes_for_search(trans, root, p, b, level, ins_len); - if (ret == -EAGAIN) + if (err == -EAGAIN) goto again; - else if (ret) + if (err) { + ret = err; goto done; + } b = p->nodes[level]; slot = p->slots[level]; unlock_up(p, level, lowest_unlock); - /* this is only true while dropping a snapshot */ if (level == lowest_level) { - ret = 0; + if (dec) + p->slots[level]++; goto done; } - ret = read_block_for_search(trans, root, p, + err = read_block_for_search(trans, root, p, &b, level, slot, key); - if (ret == -EAGAIN) + if (err == -EAGAIN) goto again; - - if (ret == -EIO) + if (err) { + ret = err; goto done; + } if (!p->skip_locking) { - int lret; - btrfs_clear_path_blocking(p, NULL); - lret = btrfs_try_spin_lock(b); + err = btrfs_try_spin_lock(b); - if (!lret) { + if (!err) { btrfs_set_path_blocking(p); btrfs_tree_lock(b); btrfs_clear_path_blocking(p, b); @@ -1837,16 +1825,14 @@ cow_done: p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, b) < ins_len) { - int sret; - btrfs_set_path_blocking(p); - sret = split_leaf(trans, root, key, - p, ins_len, ret == 0); + err = split_leaf(trans, root, key, + p, ins_len, ret == 0); btrfs_clear_path_blocking(p, NULL); - BUG_ON(sret > 0); - if (sret) { - ret = sret; + BUG_ON(err > 0); + if (err) { + ret = err; goto done; } } @@ -3807,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below @@ -4042,10 +4028,9 @@ out: * calling this function. */ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *key, int lowest_level, + struct btrfs_key *key, int level, int cache_only, u64 min_trans) { - int level = lowest_level; int slot; struct extent_buffer *c; @@ -4058,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, c = path->nodes[level]; next: if (slot >= btrfs_header_nritems(c)) { - level++; - if (level == BTRFS_MAX_LEVEL) + int ret; + int orig_lowest; + struct btrfs_key cur_key; + if (level + 1 >= BTRFS_MAX_LEVEL || + !path->nodes[level + 1]) return 1; - continue; + + if (path->locks[level + 1]) { + level++; + continue; + } + + slot = btrfs_header_nritems(c) - 1; + if (level == 0) + btrfs_item_key_to_cpu(c, &cur_key, slot); + else + btrfs_node_key_to_cpu(c, &cur_key, slot); + + orig_lowest = path->lowest_level; + btrfs_release_path(root, path); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &cur_key, path, + 0, 0); + path->lowest_level = orig_lowest; + if (ret < 0) + return ret; + + c = path->nodes[level]; + slot = path->slots[level]; + if (ret == 0) + slot++; + goto next; } + if (level == 0) btrfs_item_key_to_cpu(c, key, slot); else { @@ -4146,7 +4160,8 @@ again: * advance the path if there are now more items available. */ if (nritems > 0 && path->slots[0] < nritems - 1) { - path->slots[0]++; + if (ret == 0) + path->slots[0]++; ret = 0; goto done; } @@ -4278,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root, path->slots[0]--; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.type == type) - return 0; if (found_key.objectid < min_objectid) break; + if (found_key.type == type) + return 0; if (found_key.objectid == min_objectid && found_key.type < type) break; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 98a873838717..837435ce84ca 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -481,7 +481,7 @@ struct btrfs_shared_data_ref { struct btrfs_extent_inline_ref { u8 type; - u64 offset; + __le64 offset; } __attribute__ ((__packed__)); /* old style backrefs item */ @@ -689,6 +689,7 @@ struct btrfs_space_info { struct list_head block_groups; spinlock_t lock; struct rw_semaphore groups_sem; + atomic_t caching_threads; }; /* @@ -707,6 +708,9 @@ struct btrfs_free_cluster { /* first extent starting offset */ u64 window_start; + /* if this cluster simply points at a bitmap in the block group */ + bool points_to_bitmap; + struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the @@ -716,24 +720,37 @@ struct btrfs_free_cluster { struct list_head block_group_list; }; +enum btrfs_caching_type { + BTRFS_CACHE_NO = 0, + BTRFS_CACHE_STARTED = 1, + BTRFS_CACHE_FINISHED = 2, +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; + struct btrfs_fs_info *fs_info; spinlock_t lock; - struct mutex cache_mutex; u64 pinned; u64 reserved; u64 flags; - int cached; + u64 sectorsize; + int extents_thresh; + int free_extents; + int total_bitmaps; int ro; int dirty; + /* cache tracking stuff */ + wait_queue_head_t caching_q; + int cached; + struct btrfs_space_info *space_info; /* free space cache stuff */ spinlock_t tree_lock; - struct rb_root free_space_bytes; struct rb_root free_space_offset; + u64 free_space; /* block group cache stuff */ struct rb_node cache_node; @@ -808,6 +825,7 @@ struct btrfs_fs_info { struct mutex drop_mutex; struct mutex volume_mutex; struct mutex tree_reloc_mutex; + struct rw_semaphore extent_commit_sem; /* * this protects the ordered operations list only while we are @@ -1988,6 +2006,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); +void btrfs_free_pinned_extents(struct btrfs_fs_info *info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d28d29c95f7c..8b8192790011 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; + bdi->name = "btrfs"; bdi->capabilities = BDI_CAP_MAP_COPY; err = bdi_init(bdi); if (err) @@ -1599,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); + sb->s_bdi = &fs_info->bdi; /* * we set the i_size on the btree inode to the max possible int. @@ -1639,6 +1641,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); mutex_init(&fs_info->tree_reloc_mutex); + init_rwsem(&fs_info->extent_commit_sem); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -1799,6 +1802,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_super_chunk_root(disk_super), blocksize, generation); BUG_ON(!chunk_root->node); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", + sb->s_id); + goto fail_chunk_root; + } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); chunk_root->commit_root = btrfs_root_node(chunk_root); @@ -1826,6 +1834,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, blocksize, generation); if (!tree_root->node) goto fail_chunk_root; + if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", + sb->s_id); + goto fail_tree_root; + } btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); @@ -2322,6 +2335,9 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } + fs_info->closing = 2; + smp_mb(); + if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", (unsigned long long)fs_info->delalloc_bytes); @@ -2343,6 +2359,7 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(root->fs_info->csum_root->commit_root); btrfs_free_block_groups(root->fs_info); + btrfs_free_pinned_extents(root->fs_info); del_fs_roots(fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5aca3997d42..535f85ba104f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -21,6 +21,7 @@ #include <linux/blkdev.h> #include <linux/sort.h> #include <linux/rcupdate.h> +#include <linux/kthread.h> #include "compat.h" #include "hash.h" #include "ctree.h" @@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force); +static noinline int +block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + smp_mb(); + return cache->cached == BTRFS_CACHE_FINISHED; +} + static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) { return (cache->flags & bits) == bits; @@ -146,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, } /* + * We always set EXTENT_LOCKED for the super mirror extents so we don't + * overwrite them, so those bits need to be unset. Also, if we are unmounting + * with pinned extents still sitting there because we had a block group caching, + * we need to clear those now, since we are done. + */ +void btrfs_free_pinned_extents(struct btrfs_fs_info *info) +{ + u64 start, end, last = 0; + int ret; + + while (1) { + ret = find_first_extent_bit(&info->pinned_extents, last, + &start, &end, + EXTENT_LOCKED|EXTENT_DIRTY); + if (ret) + break; + + clear_extent_bits(&info->pinned_extents, start, end, + EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); + last = end+1; + } +} + +static int remove_sb_from_cache(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(&root->fs_info->mapping_tree, + cache->key.objectid, bytenr, + 0, &logical, &nr, &stripe_len); + BUG_ON(ret); + while (nr--) { + try_lock_extent(&fs_info->pinned_extents, + logical[nr], + logical[nr] + stripe_len - 1, GFP_NOFS); + } + kfree(logical); + } + + return 0; +} + +/* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet * since their free space will be released as soon as the transaction commits. */ -static int add_new_free_space(struct btrfs_block_group_cache *block_group, +static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *info, u64 start, u64 end) { - u64 extent_start, extent_end, size; + u64 extent_start, extent_end, size, total_added = 0; int ret; while (start < end) { ret = find_first_extent_bit(&info->pinned_extents, start, &extent_start, &extent_end, - EXTENT_DIRTY); + EXTENT_DIRTY|EXTENT_LOCKED); if (ret) break; @@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, start = extent_end + 1; } else if (extent_start > start && extent_start < end) { size = extent_start - start; + total_added += size; ret = btrfs_add_free_space(block_group, start, size); BUG_ON(ret); @@ -178,84 +237,93 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, if (start < end) { size = end - start; + total_added += size; ret = btrfs_add_free_space(block_group, start, size); BUG_ON(ret); } - return 0; -} - -static int remove_sb_from_cache(struct btrfs_root *root, - struct btrfs_block_group_cache *cache) -{ - u64 bytenr; - u64 *logical; - int stripe_len; - int i, nr, ret; - - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(&root->fs_info->mapping_tree, - cache->key.objectid, bytenr, 0, - &logical, &nr, &stripe_len); - BUG_ON(ret); - while (nr--) { - btrfs_remove_free_space(cache, logical[nr], - stripe_len); - } - kfree(logical); - } - return 0; + return total_added; } -static int cache_block_group(struct btrfs_root *root, - struct btrfs_block_group_cache *block_group) +static int caching_kthread(void *data) { + struct btrfs_block_group_cache *block_group = data; + struct btrfs_fs_info *fs_info = block_group->fs_info; + u64 last = 0; struct btrfs_path *path; int ret = 0; struct btrfs_key key; struct extent_buffer *leaf; int slot; - u64 last; - - if (!block_group) - return 0; - - root = root->fs_info->extent_root; + u64 total_found = 0; - if (block_group->cached) - return 0; + BUG_ON(!fs_info); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + atomic_inc(&block_group->space_info->caching_threads); + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); /* - * we get into deadlocks with paths held by callers of this function. - * since the alloc_mutex is protecting things right now, just - * skip the locking here + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent + * root to add free space. So we skip locking and search the commit + * root, since its read-only */ path->skip_locking = 1; - last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + path->search_commit_root = 1; + path->reada = 2; + key.objectid = last; key.offset = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +again: + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->extent_commit_sem); + + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto err; while (1) { + smp_mb(); + if (block_group->fs_info->closing > 1) { + last = (u64)-1; + break; + } + leaf = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_leaf(fs_info->extent_root, path); if (ret < 0) goto err; - if (ret == 0) - continue; - else + else if (ret) break; + + if (need_resched() || + btrfs_transaction_in_commit(fs_info)) { + leaf = path->nodes[0]; + + /* this shouldn't happen, but if the + * leaf is empty just move on. + */ + if (btrfs_header_nritems(leaf) == 0) + break; + /* + * we need to copy the key out so that + * we are sure the next search advances + * us forward in the btree. + */ + btrfs_item_key_to_cpu(leaf, &key, 0); + btrfs_release_path(fs_info->extent_root, path); + up_read(&fs_info->extent_commit_sem); + schedule_timeout(1); + goto again; + } + + continue; } btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid < block_group->key.objectid) @@ -266,24 +334,59 @@ static int cache_block_group(struct btrfs_root *root, break; if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { - add_new_free_space(block_group, root->fs_info, last, - key.objectid); - + total_found += add_new_free_space(block_group, + fs_info, last, + key.objectid); last = key.objectid + key.offset; } + + if (total_found > (1024 * 1024 * 2)) { + total_found = 0; + wake_up(&block_group->caching_q); + } next: path->slots[0]++; } + ret = 0; - add_new_free_space(block_group, root->fs_info, last, - block_group->key.objectid + - block_group->key.offset); + total_found += add_new_free_space(block_group, fs_info, last, + block_group->key.objectid + + block_group->key.offset); + + spin_lock(&block_group->lock); + block_group->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); - block_group->cached = 1; - remove_sb_from_cache(root, block_group); - ret = 0; err: btrfs_free_path(path); + up_read(&fs_info->extent_commit_sem); + atomic_dec(&block_group->space_info->caching_threads); + wake_up(&block_group->caching_q); + + return 0; +} + +static int cache_block_group(struct btrfs_block_group_cache *cache) +{ + struct task_struct *tsk; + int ret = 0; + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + return ret; + } + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", + cache->key.objectid); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); + printk(KERN_ERR "error running thread %d\n", ret); + BUG(); + } + return ret; } @@ -1408,7 +1511,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { - blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, + DISCARD_FL_BARRIER); } #endif @@ -2387,13 +2491,29 @@ fail: } +static struct btrfs_block_group_cache * +next_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct rb_node *node; + spin_lock(&root->fs_info->block_group_cache_lock); + node = rb_next(&cache->cache_node); + btrfs_put_block_group(cache); + if (node) { + cache = rb_entry(node, struct btrfs_block_group_cache, + cache_node); + atomic_inc(&cache->count); + } else + cache = NULL; + spin_unlock(&root->fs_info->block_group_cache_lock); + return cache; +} + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_group_cache *cache, *entry; - struct rb_node *n; + struct btrfs_block_group_cache *cache; int err = 0; - int werr = 0; struct btrfs_path *path; u64 last = 0; @@ -2402,39 +2522,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, return -ENOMEM; while (1) { - cache = NULL; - spin_lock(&root->fs_info->block_group_cache_lock); - for (n = rb_first(&root->fs_info->block_group_cache_tree); - n; n = rb_next(n)) { - entry = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - if (entry->dirty) { - cache = entry; - break; - } + if (last == 0) { + err = btrfs_run_delayed_refs(trans, root, + (unsigned long)-1); + BUG_ON(err); } - spin_unlock(&root->fs_info->block_group_cache_lock); - if (!cache) - break; + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + if (cache->dirty) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } cache->dirty = 0; - last += cache->key.offset; + last = cache->key.objectid + cache->key.offset; - err = write_one_cache_group(trans, root, - path, cache); - /* - * if we fail to write the cache group, we want - * to keep it marked dirty in hopes that a later - * write will work - */ - if (err) { - werr = err; - continue; - } + err = write_one_cache_group(trans, root, path, cache); + BUG_ON(err); + btrfs_put_block_group(cache); } + btrfs_free_path(path); - return werr; + return 0; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -2484,6 +2600,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->force_alloc = 0; *space_info = found; list_add_rcu(&found->list, &info->space_info); + atomic_set(&found->caching_threads, 0); return 0; } @@ -2947,13 +3064,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; - if (pin) { + if (pin) set_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); - } else { - clear_extent_dirty(&fs_info->pinned_extents, - bytenr, bytenr + num - 1, GFP_NOFS); - } while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); @@ -2969,14 +3082,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, spin_unlock(&cache->space_info->lock); fs_info->total_pinned += len; } else { + int unpin = 0; + + /* + * in order to not race with the block group caching, we + * only want to unpin the extent if we are cached. If + * we aren't cached, we want to start async caching this + * block group so we can free the extent the next time + * around. + */ spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - cache->pinned -= len; - cache->space_info->bytes_pinned -= len; + unpin = (cache->cached == BTRFS_CACHE_FINISHED); + if (likely(unpin)) { + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + fs_info->total_pinned -= len; + } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - fs_info->total_pinned -= len; - if (cache->cached) + + if (likely(unpin)) + clear_extent_dirty(&fs_info->pinned_extents, + bytenr, bytenr + len -1, + GFP_NOFS); + else + cache_block_group(cache); + + if (unpin) btrfs_add_free_space(cache, bytenr, len); } btrfs_put_block_group(cache); @@ -3030,6 +3163,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) &start, &end, EXTENT_DIRTY); if (ret) break; + set_extent_dirty(copy, start, end, GFP_NOFS); last = end + 1; } @@ -3058,6 +3192,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } + return ret; } @@ -3436,6 +3571,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val) } /* + * when we wait for progress in the block group caching, its because + * our allocation attempt failed at least once. So, we must sleep + * and let some progress happen before we try again. + * + * This function will sleep at least once waiting for new free space to + * show up, and then it will check the block group free space numbers + * for our min num_bytes. Another option is to have it go ahead + * and look in the rbtree for a free extent of a given size, but this + * is a good start. + */ +static noinline int +wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); + + if (block_group_cache_done(cache)) { + finish_wait(&cache->caching_q, &wait); + return 0; + } + schedule(); + finish_wait(&cache->caching_q, &wait); + + wait_event(cache->caching_q, block_group_cache_done(cache) || + (cache->free_space >= num_bytes)); + return 0; +} + +enum btrfs_loop_type { + LOOP_CACHED_ONLY = 0, + LOOP_CACHING_NOWAIT = 1, + LOOP_CACHING_WAIT = 2, + LOOP_ALLOC_CHUNK = 3, + LOOP_NO_EMPTY_SIZE = 4, +}; + +/* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: * ins->objectid == block start @@ -3460,6 +3634,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_space_info *space_info; int last_ptr_loop = 0; int loop = 0; + bool found_uncached_bg = false; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -3491,15 +3666,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - if (!last_ptr) { + if (!last_ptr) empty_cluster = 0; - loop = 1; - } if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); - if (block_group && block_group_bits(block_group, data)) { + /* + * we don't want to use the block group if it doesn't match our + * allocation bits, or if its not cached. + */ + if (block_group && block_group_bits(block_group, data) && + block_group_cache_done(block_group)) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -3522,21 +3700,35 @@ search: down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups, list) { u64 offset; + int cached; atomic_inc(&block_group->count); search_start = block_group->key.objectid; have_block_group: - if (unlikely(!block_group->cached)) { - mutex_lock(&block_group->cache_mutex); - ret = cache_block_group(root, block_group); - mutex_unlock(&block_group->cache_mutex); - if (ret) { - btrfs_put_block_group(block_group); - break; + if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + /* + * we want to start caching kthreads, but not too many + * right off the bat so we don't overwhelm the system, + * so only start them if there are less than 2 and we're + * in the initial allocation phase. + */ + if (loop > LOOP_CACHING_NOWAIT || + atomic_read(&space_info->caching_threads) < 2) { + ret = cache_block_group(block_group); + BUG_ON(ret); } } + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) { + found_uncached_bg = true; + + /* if we only want cached bgs, loop */ + if (loop == LOOP_CACHED_ONLY) + goto loop; + } + if (unlikely(block_group->ro)) goto loop; @@ -3615,14 +3807,21 @@ refill_cluster: spin_unlock(&last_ptr->refill_lock); goto checks; } + } else if (!cached && loop > LOOP_CACHING_NOWAIT) { + spin_unlock(&last_ptr->refill_lock); + + wait_block_group_cache_progress(block_group, + num_bytes + empty_cluster + empty_size); + goto have_block_group; } + /* * at this point we either didn't find a cluster * or we weren't able to allocate a block from our * cluster. Free the cluster we've been trying * to use, and go to the next block group */ - if (loop < 2) { + if (loop < LOOP_NO_EMPTY_SIZE) { btrfs_return_cluster_to_free_space(NULL, last_ptr); spin_unlock(&last_ptr->refill_lock); @@ -3633,11 +3832,17 @@ refill_cluster: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); - if (!offset) + if (!offset && (cached || (!cached && + loop == LOOP_CACHING_NOWAIT))) { goto loop; + } else if (!offset && (!cached && + loop > LOOP_CACHING_NOWAIT)) { + wait_block_group_cache_progress(block_group, + num_bytes + empty_size); + goto have_block_group; + } checks: search_start = stripe_align(root, offset); - /* move on to the next group */ if (search_start + num_bytes >= search_end) { btrfs_add_free_space(block_group, offset, num_bytes); @@ -3683,13 +3888,26 @@ loop: } up_read(&space_info->groups_sem); - /* loop == 0, try to find a clustered alloc in every block group - * loop == 1, try again after forcing a chunk allocation - * loop == 2, set empty_size and empty_cluster to 0 and try again + /* LOOP_CACHED_ONLY, only search fully cached block groups + * LOOP_CACHING_NOWAIT, search partially cached block groups, but + * dont wait foR them to finish caching + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again */ - if (!ins->objectid && loop < 3 && - (empty_size || empty_cluster || allowed_chunk_alloc)) { - if (loop >= 2) { + if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && + (found_uncached_bg || empty_size || empty_cluster || + allowed_chunk_alloc)) { + if (found_uncached_bg) { + found_uncached_bg = false; + if (loop < LOOP_CACHING_WAIT) { + loop++; + goto search; + } + } + + if (loop == LOOP_ALLOC_CHUNK) { empty_size = 0; empty_cluster = 0; } @@ -3702,7 +3920,7 @@ loop: space_info->force_alloc = 1; } - if (loop < 3) { + if (loop < LOOP_NO_EMPTY_SIZE) { loop++; goto search; } @@ -3798,7 +4016,7 @@ again: num_bytes, data, 1); goto again; } - if (ret) { + if (ret == -ENOSPC) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, data); @@ -3806,7 +4024,6 @@ again: "wanted %llu\n", (unsigned long long)data, (unsigned long long)num_bytes); dump_space_info(sinfo, num_bytes); - BUG(); } return ret; @@ -3844,7 +4061,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, empty_size, hint_byte, search_end, ins, data); - update_reserved_extents(root, ins->objectid, ins->offset, 1); + if (!ret) + update_reserved_extents(root, ins->objectid, ins->offset, 1); + return ret; } @@ -4006,9 +4225,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - mutex_lock(&block_group->cache_mutex); - cache_block_group(root, block_group); - mutex_unlock(&block_group->cache_mutex); + cache_block_group(block_group); + wait_event(block_group->caching_q, + block_group_cache_done(block_group)); ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset); @@ -4039,7 +4258,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, empty_size, hint_byte, search_end, ins, 0); - BUG_ON(ret); + if (ret) + return ret; if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) @@ -6955,11 +7175,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) &info->block_group_cache_tree); spin_unlock(&info->block_group_cache_lock); - btrfs_remove_free_space_cache(block_group); down_write(&block_group->space_info->groups_sem); list_del(&block_group->list); up_write(&block_group->space_info->groups_sem); + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_event(block_group->caching_q, + block_group_cache_done(block_group)); + + btrfs_remove_free_space_cache(block_group); + WARN_ON(atomic_read(&block_group->count) != 1); kfree(block_group); @@ -7025,9 +7250,19 @@ int btrfs_read_block_groups(struct btrfs_root *root) atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); - mutex_init(&cache->cache_mutex); + cache->fs_info = info; + init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); + + /* + * we only want to have 32k of ram per block group for keeping + * track of free space, and if we pass 1/2 of that we want to + * start converting things over to using bitmaps + */ + cache->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); + read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); @@ -7036,6 +7271,26 @@ int btrfs_read_block_groups(struct btrfs_root *root) key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(root, path); cache->flags = btrfs_block_group_flags(&cache->item); + cache->sectorsize = root->sectorsize; + + remove_sb_from_cache(root, cache); + + /* + * check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all + * the space in and be done with it. This saves us _alot_ of + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { + cache->cached = BTRFS_CACHE_FINISHED; + } else if (btrfs_block_group_used(&cache->item) == 0) { + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, root->fs_info, + found_key.objectid, + found_key.objectid + + found_key.offset); + } ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), @@ -7079,10 +7334,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.objectid = chunk_offset; cache->key.offset = size; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = root->sectorsize; + + /* + * we only want to have 32k of ram per block group for keeping track + * of free space, and if we pass 1/2 of that we want to start + * converting things over to using bitmaps + */ + cache->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); - mutex_init(&cache->cache_mutex); + init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); @@ -7091,6 +7355,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->cached = BTRFS_CACHE_FINISHED; + remove_sb_from_cache(root, cache); + + add_new_free_space(cache, root->fs_info, chunk_offset, + chunk_offset + size); + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); @@ -7149,7 +7419,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); spin_unlock(&root->fs_info->block_group_cache_lock); - btrfs_remove_free_space_cache(block_group); + down_write(&block_group->space_info->groups_sem); /* * we must use list_del_init so people can check to see if they @@ -7158,11 +7428,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, list_del_init(&block_group->list); up_write(&block_group->space_info->groups_sem); + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_event(block_group->caching_q, + block_group_cache_done(block_group)); + + btrfs_remove_free_space_cache(block_group); + spin_lock(&block_group->space_info->lock); block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; spin_unlock(&block_group->space_info->lock); - block_group->space_info->full = 0; + + btrfs_clear_space_info_full(root->fs_info); btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7c3cd248d8d6..4b833972273a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -22,7 +22,6 @@ #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> -#include <linux/smp_lock.h> #include <linux/backing-dev.h> #include <linux/mpage.h> #include <linux/swap.h> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4538e48581a5..5edcee3a617f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -16,45 +16,46 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/pagemap.h> #include <linux/sched.h> +#include <linux/math64.h> #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" -struct btrfs_free_space { - struct rb_node bytes_index; - struct rb_node offset_index; - u64 offset; - u64 bytes; -}; +#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) -static int tree_insert_offset(struct rb_root *root, u64 offset, - struct rb_node *node) +static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, + u64 offset) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct btrfs_free_space *info; + BUG_ON(offset < bitmap_start); + offset -= bitmap_start; + return (unsigned long)(div64_u64(offset, sectorsize)); +} - while (*p) { - parent = *p; - info = rb_entry(parent, struct btrfs_free_space, offset_index); +static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize) +{ + return (unsigned long)(div64_u64(bytes, sectorsize)); +} - if (offset < info->offset) - p = &(*p)->rb_left; - else if (offset > info->offset) - p = &(*p)->rb_right; - else - return -EEXIST; - } +static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group, + u64 offset) +{ + u64 bitmap_start; + u64 bytes_per_bitmap; - rb_link_node(node, parent, p); - rb_insert_color(node, root); + bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize; + bitmap_start = offset - block_group->key.objectid; + bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); + bitmap_start *= bytes_per_bitmap; + bitmap_start += block_group->key.objectid; - return 0; + return bitmap_start; } -static int tree_insert_bytes(struct rb_root *root, u64 bytes, - struct rb_node *node) +static int tree_insert_offset(struct rb_root *root, u64 offset, + struct rb_node *node, int bitmap) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, while (*p) { parent = *p; - info = rb_entry(parent, struct btrfs_free_space, bytes_index); + info = rb_entry(parent, struct btrfs_free_space, offset_index); - if (bytes < info->bytes) + if (offset < info->offset) { p = &(*p)->rb_left; - else + } else if (offset > info->offset) { p = &(*p)->rb_right; + } else { + /* + * we could have a bitmap entry and an extent entry + * share the same offset. If this is the case, we want + * the extent entry to always be found first if we do a + * linear search through the tree, since we want to have + * the quickest allocation time, and allocating from an + * extent is faster than allocating from a bitmap. So + * if we're inserting a bitmap and we find an entry at + * this offset, we want to go right, or after this entry + * logically. If we are inserting an extent and we've + * found a bitmap, we want to go left, or before + * logically. + */ + if (bitmap) { + WARN_ON(info->bitmap); + p = &(*p)->rb_right; + } else { + WARN_ON(!info->bitmap); + p = &(*p)->rb_left; + } + } } rb_link_node(node, parent, p); @@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, /* * searches the tree for the given offset. * - * fuzzy == 1: this is used for allocations where we are given a hint of where - * to look for free space. Because the hint may not be completely on an offset - * mark, or the hint may no longer point to free space we need to fudge our - * results a bit. So we look for free space starting at or after offset with at - * least bytes size. We prefer to find as close to the given offset as we can. - * Also if the offset is within a free space range, then we will return the free - * space that contains the given offset, which means we can return a free space - * chunk with an offset before the provided offset. - * - * fuzzy == 0: this is just a normal tree search. Give us the free space that - * starts at the given offset which is at least bytes size, and if its not there - * return NULL. + * fuzzy - If this is set, then we are trying to make an allocation, and we just + * want a section that has at least bytes size and comes at or after the given + * offset. */ -static struct btrfs_free_space *tree_search_offset(struct rb_root *root, - u64 offset, u64 bytes, - int fuzzy) +static struct btrfs_free_space * +tree_search_offset(struct btrfs_block_group_cache *block_group, + u64 offset, int bitmap_only, int fuzzy) { - struct rb_node *n = root->rb_node; - struct btrfs_free_space *entry, *ret = NULL; + struct rb_node *n = block_group->free_space_offset.rb_node; + struct btrfs_free_space *entry, *prev = NULL; + + /* find entry that is closest to the 'offset' */ + while (1) { + if (!n) { + entry = NULL; + break; + } - while (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); + prev = entry; - if (offset < entry->offset) { - if (fuzzy && - (!ret || entry->offset < ret->offset) && - (bytes <= entry->bytes)) - ret = entry; + if (offset < entry->offset) n = n->rb_left; - } else if (offset > entry->offset) { - if (fuzzy && - (entry->offset + entry->bytes - 1) >= offset && - bytes <= entry->bytes) { - ret = entry; - break; - } + else if (offset > entry->offset) n = n->rb_right; - } else { - if (bytes > entry->bytes) { - n = n->rb_right; - continue; - } - ret = entry; + else break; - } } - return ret; -} + if (bitmap_only) { + if (!entry) + return NULL; + if (entry->bitmap) + return entry; -/* - * return a chunk at least bytes size, as close to offset that we can get. - */ -static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, - u64 offset, u64 bytes) -{ - struct rb_node *n = root->rb_node; - struct btrfs_free_space *entry, *ret = NULL; - - while (n) { - entry = rb_entry(n, struct btrfs_free_space, bytes_index); + /* + * bitmap entry and extent entry may share same offset, + * in that case, bitmap entry comes after extent entry. + */ + n = rb_next(n); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + if (entry->offset != offset) + return NULL; - if (bytes < entry->bytes) { + WARN_ON(!entry->bitmap); + return entry; + } else if (entry) { + if (entry->bitmap) { /* - * We prefer to get a hole size as close to the size we - * are asking for so we don't take small slivers out of - * huge holes, but we also want to get as close to the - * offset as possible so we don't have a whole lot of - * fragmentation. + * if previous extent entry covers the offset, + * we should return it instead of the bitmap entry */ - if (offset <= entry->offset) { - if (!ret) - ret = entry; - else if (entry->bytes < ret->bytes) - ret = entry; - else if (entry->offset < ret->offset) - ret = entry; + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + entry = prev; + break; + } } - n = n->rb_left; - } else if (bytes > entry->bytes) { - n = n->rb_right; + } + return entry; + } + + if (!prev) + return NULL; + + /* find last entry before the 'offset' */ + entry = prev; + if (entry->offset > offset) { + n = rb_prev(&entry->offset_index); + if (n) { + entry = rb_entry(n, struct btrfs_free_space, + offset_index); + BUG_ON(entry->offset > offset); } else { - /* - * Ok we may have multiple chunks of the wanted size, - * so we don't want to take the first one we find, we - * want to take the one closest to our given offset, so - * keep searching just in case theres a better match. - */ - n = n->rb_right; - if (offset > entry->offset) - continue; - else if (!ret || entry->offset < ret->offset) - ret = entry; + if (fuzzy) + return entry; + else + return NULL; } } - return ret; + if (entry->bitmap) { + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + return prev; + break; + } + } + if (entry->offset + BITS_PER_BITMAP * + block_group->sectorsize > offset) + return entry; + } else if (entry->offset + entry->bytes > offset) + return entry; + + if (!fuzzy) + return NULL; + + while (1) { + if (entry->bitmap) { + if (entry->offset + BITS_PER_BITMAP * + block_group->sectorsize > offset) + break; + } else { + if (entry->offset + entry->bytes > offset) + break; + } + + n = rb_next(&entry->offset_index); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + } + return entry; } static void unlink_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *info) { rb_erase(&info->offset_index, &block_group->free_space_offset); - rb_erase(&info->bytes_index, &block_group->free_space_bytes); + block_group->free_extents--; + block_group->free_space -= info->bytes; } static int link_free_space(struct btrfs_block_group_cache *block_group, @@ -190,17 +246,353 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, { int ret = 0; - - BUG_ON(!info->bytes); + BUG_ON(!info->bitmap && !info->bytes); ret = tree_insert_offset(&block_group->free_space_offset, info->offset, - &info->offset_index); + &info->offset_index, (info->bitmap != NULL)); if (ret) return ret; - ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, - &info->bytes_index); - if (ret) - return ret; + block_group->free_space += info->bytes; + block_group->free_extents++; + return ret; +} + +static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) +{ + u64 max_bytes, possible_bytes; + + /* + * The goal is to keep the total amount of memory used per 1gb of space + * at or below 32k, so we need to adjust how much memory we allow to be + * used by extent based free space tracking + */ + max_bytes = MAX_CACHE_BYTES_PER_GIG * + (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + + possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + + (sizeof(struct btrfs_free_space) * + block_group->extents_thresh); + + if (possible_bytes > max_bytes) { + int extent_bytes = max_bytes - + (block_group->total_bitmaps * PAGE_CACHE_SIZE); + + if (extent_bytes <= 0) { + block_group->extents_thresh = 0; + return; + } + + block_group->extents_thresh = extent_bytes / + (sizeof(struct btrfs_free_space)); + } +} + +static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, end; + unsigned long i; + + start = offset_to_bit(info->offset, block_group->sectorsize, offset); + end = start + bytes_to_bits(bytes, block_group->sectorsize); + BUG_ON(end > BITS_PER_BITMAP); + + for (i = start; i < end; i++) + clear_bit(i, info->bitmap); + + info->bytes -= bytes; + block_group->free_space -= bytes; +} + +static void bitmap_set_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, end; + unsigned long i; + + start = offset_to_bit(info->offset, block_group->sectorsize, offset); + end = start + bytes_to_bits(bytes, block_group->sectorsize); + BUG_ON(end > BITS_PER_BITMAP); + + for (i = start; i < end; i++) + set_bit(i, info->bitmap); + + info->bytes += bytes; + block_group->free_space += bytes; +} + +static int search_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info, u64 *offset, + u64 *bytes) +{ + unsigned long found_bits = 0; + unsigned long bits, i; + unsigned long next_zero; + + i = offset_to_bit(bitmap_info->offset, block_group->sectorsize, + max_t(u64, *offset, bitmap_info->offset)); + bits = bytes_to_bits(*bytes, block_group->sectorsize); + + for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(bitmap_info->bitmap, + BITS_PER_BITMAP, i); + if ((next_zero - i) >= bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (found_bits) { + *offset = (u64)(i * block_group->sectorsize) + + bitmap_info->offset; + *bytes = (u64)(found_bits) * block_group->sectorsize; + return 0; + } + + return -1; +} + +static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache + *block_group, u64 *offset, + u64 *bytes, int debug) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + int ret; + + if (!block_group->free_space_offset.rb_node) + return NULL; + + entry = tree_search_offset(block_group, + offset_to_bitmap(block_group, *offset), + 0, 1); + if (!entry) + return NULL; + + for (node = &entry->offset_index; node; node = rb_next(node)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (entry->bytes < *bytes) + continue; + + if (entry->bitmap) { + ret = search_bitmap(block_group, entry, offset, bytes); + if (!ret) + return entry; + continue; + } + + *offset = entry->offset; + *bytes = entry->bytes; + return entry; + } + + return NULL; +} + +static void add_new_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset) +{ + u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; + int max_bitmaps = (int)div64_u64(block_group->key.offset + + bytes_per_bg - 1, bytes_per_bg); + BUG_ON(block_group->total_bitmaps >= max_bitmaps); + + info->offset = offset_to_bitmap(block_group, offset); + link_free_space(block_group, info); + block_group->total_bitmaps++; + + recalculate_thresholds(block_group); +} + +static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info, + u64 *offset, u64 *bytes) +{ + u64 end; + u64 search_start, search_bytes; + int ret; + +again: + end = bitmap_info->offset + + (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1; + + /* + * XXX - this can go away after a few releases. + * + * since the only user of btrfs_remove_free_space is the tree logging + * stuff, and the only way to test that is under crash conditions, we + * want to have this debug stuff here just in case somethings not + * working. Search the bitmap for the space we are trying to use to + * make sure its actually there. If its not there then we need to stop + * because something has gone wrong. + */ + search_start = *offset; + search_bytes = *bytes; + ret = search_bitmap(block_group, bitmap_info, &search_start, + &search_bytes); + BUG_ON(ret < 0 || search_start != *offset); + + if (*offset > bitmap_info->offset && *offset + *bytes > end) { + bitmap_clear_bits(block_group, bitmap_info, *offset, + end - *offset + 1); + *bytes -= end - *offset + 1; + *offset = end + 1; + } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { + bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes); + *bytes = 0; + } + + if (*bytes) { + struct rb_node *next = rb_next(&bitmap_info->offset_index); + if (!bitmap_info->bytes) { + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kfree(bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + + /* + * no entry after this bitmap, but we still have bytes to + * remove, so something has gone wrong. + */ + if (!next) + return -EINVAL; + + bitmap_info = rb_entry(next, struct btrfs_free_space, + offset_index); + + /* + * if the next entry isn't a bitmap we need to return to let the + * extent stuff do its work. + */ + if (!bitmap_info->bitmap) + return -EAGAIN; + + /* + * Ok the next item is a bitmap, but it may not actually hold + * the information for the rest of this free space stuff, so + * look for it, and if we don't find it return so we can try + * everything over again. + */ + search_start = *offset; + search_bytes = *bytes; + ret = search_bitmap(block_group, bitmap_info, &search_start, + &search_bytes); + if (ret < 0 || search_start != *offset) + return -EAGAIN; + + goto again; + } else if (!bitmap_info->bytes) { + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kfree(bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + + return 0; +} + +static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + struct btrfs_free_space *bitmap_info; + int added = 0; + u64 bytes, offset, end; + int ret; + + /* + * If we are below the extents threshold then we can add this as an + * extent, and don't have to deal with the bitmap + */ + if (block_group->free_extents < block_group->extents_thresh && + info->bytes > block_group->sectorsize * 4) + return 0; + + /* + * some block groups are so tiny they can't be enveloped by a bitmap, so + * don't even bother to create a bitmap for this + */ + if (BITS_PER_BITMAP * block_group->sectorsize > + block_group->key.offset) + return 0; + + bytes = info->bytes; + offset = info->offset; + +again: + bitmap_info = tree_search_offset(block_group, + offset_to_bitmap(block_group, offset), + 1, 0); + if (!bitmap_info) { + BUG_ON(added); + goto new_bitmap; + } + + end = bitmap_info->offset + + (u64)(BITS_PER_BITMAP * block_group->sectorsize); + + if (offset >= bitmap_info->offset && offset + bytes > end) { + bitmap_set_bits(block_group, bitmap_info, offset, + end - offset); + bytes -= end - offset; + offset = end; + added = 0; + } else if (offset >= bitmap_info->offset && offset + bytes <= end) { + bitmap_set_bits(block_group, bitmap_info, offset, bytes); + bytes = 0; + } else { + BUG(); + } + + if (!bytes) { + ret = 1; + goto out; + } else + goto again; + +new_bitmap: + if (info && info->bitmap) { + add_new_bitmap(block_group, info, offset); + added = 1; + info = NULL; + goto again; + } else { + spin_unlock(&block_group->tree_lock); + + /* no pre-allocated info, allocate a new one */ + if (!info) { + info = kzalloc(sizeof(struct btrfs_free_space), + GFP_NOFS); + if (!info) { + spin_lock(&block_group->tree_lock); + ret = -ENOMEM; + goto out; + } + } + + /* allocate the bitmap */ + info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + spin_lock(&block_group->tree_lock); + if (!info->bitmap) { + ret = -ENOMEM; + goto out; + } + goto again; + } + +out: + if (info) { + if (info->bitmap) + kfree(info->bitmap); + kfree(info); + } return ret; } @@ -208,8 +600,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes) { - struct btrfs_free_space *right_info; - struct btrfs_free_space *left_info; + struct btrfs_free_space *right_info = NULL; + struct btrfs_free_space *left_info = NULL; struct btrfs_free_space *info = NULL; int ret = 0; @@ -227,18 +619,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, * are adding, if there is remove that struct and add a new one to * cover the entire range */ - right_info = tree_search_offset(&block_group->free_space_offset, - offset+bytes, 0, 0); - left_info = tree_search_offset(&block_group->free_space_offset, - offset-1, 0, 1); + right_info = tree_search_offset(block_group, offset + bytes, 0, 0); + if (right_info && rb_prev(&right_info->offset_index)) + left_info = rb_entry(rb_prev(&right_info->offset_index), + struct btrfs_free_space, offset_index); + else + left_info = tree_search_offset(block_group, offset - 1, 0, 0); + + /* + * If there was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + if ((!left_info || left_info->bitmap) && + (!right_info || right_info->bitmap)) { + ret = insert_into_bitmap(block_group, info); + + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + goto out; + } + } - if (right_info) { + if (right_info && !right_info->bitmap) { unlink_free_space(block_group, right_info); info->bytes += right_info->bytes; kfree(right_info); } - if (left_info && left_info->offset + left_info->bytes == offset) { + if (left_info && !left_info->bitmap && + left_info->offset + left_info->bytes == offset) { unlink_free_space(block_group, left_info); info->offset = left_info->offset; info->bytes += left_info->bytes; @@ -248,11 +660,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, ret = link_free_space(block_group, info); if (ret) kfree(info); - +out: spin_unlock(&block_group->tree_lock); if (ret) { - printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); + printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); BUG_ON(ret == -EEXIST); } @@ -263,40 +675,74 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes) { struct btrfs_free_space *info; + struct btrfs_free_space *next_info = NULL; int ret = 0; spin_lock(&block_group->tree_lock); - info = tree_search_offset(&block_group->free_space_offset, offset, 0, - 1); - if (info && info->offset == offset) { - if (info->bytes < bytes) { - printk(KERN_ERR "Found free space at %llu, size %llu," - "trying to use %llu\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, - (unsigned long long)bytes); +again: + info = tree_search_offset(block_group, offset, 0, 0); + if (!info) { + /* + * oops didn't find an extent that matched the space we wanted + * to remove, look for a bitmap instead + */ + info = tree_search_offset(block_group, + offset_to_bitmap(block_group, offset), + 1, 0); + if (!info) { + WARN_ON(1); + goto out_lock; + } + } + + if (info->bytes < bytes && rb_next(&info->offset_index)) { + u64 end; + next_info = rb_entry(rb_next(&info->offset_index), + struct btrfs_free_space, + offset_index); + + if (next_info->bitmap) + end = next_info->offset + BITS_PER_BITMAP * + block_group->sectorsize - 1; + else + end = next_info->offset + next_info->bytes; + + if (next_info->bytes < bytes || + next_info->offset > offset || offset > end) { + printk(KERN_CRIT "Found free space at %llu, size %llu," + " trying to use %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)bytes); WARN_ON(1); ret = -EINVAL; - spin_unlock(&block_group->tree_lock); - goto out; + goto out_lock; } - unlink_free_space(block_group, info); - if (info->bytes == bytes) { - kfree(info); - spin_unlock(&block_group->tree_lock); - goto out; + info = next_info; + } + + if (info->bytes == bytes) { + unlink_free_space(block_group, info); + if (info->bitmap) { + kfree(info->bitmap); + block_group->total_bitmaps--; } + kfree(info); + goto out_lock; + } + if (!info->bitmap && info->offset == offset) { + unlink_free_space(block_group, info); info->offset += bytes; info->bytes -= bytes; + link_free_space(block_group, info); + goto out_lock; + } - ret = link_free_space(block_group, info); - spin_unlock(&block_group->tree_lock); - BUG_ON(ret); - } else if (info && info->offset < offset && - info->offset + info->bytes >= offset + bytes) { + if (!info->bitmap && info->offset <= offset && + info->offset + info->bytes >= offset + bytes) { u64 old_start = info->offset; /* * we're freeing space in the middle of the info, @@ -312,7 +758,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, info->offset = offset + bytes; info->bytes = old_end - info->offset; ret = link_free_space(block_group, info); - BUG_ON(ret); + WARN_ON(ret); + if (ret) + goto out_lock; } else { /* the hole we're creating ends at the end * of the info struct, just free the info @@ -320,32 +768,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, kfree(info); } spin_unlock(&block_group->tree_lock); - /* step two, insert a new info struct to cover anything - * before the hole + + /* step two, insert a new info struct to cover + * anything before the hole */ ret = btrfs_add_free_space(block_group, old_start, offset - old_start); - BUG_ON(ret); - } else { - spin_unlock(&block_group->tree_lock); - if (!info) { - printk(KERN_ERR "couldn't find space %llu to free\n", - (unsigned long long)offset); - printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", - block_group->cached, - (unsigned long long)block_group->key.objectid, - (unsigned long long)block_group->key.offset); - btrfs_dump_free_space(block_group, bytes); - } else if (info) { - printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " - "but wanted offset=%llu bytes=%llu\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - } - WARN_ON(1); + WARN_ON(ret); + goto out; } + + ret = remove_from_bitmap(block_group, info, &offset, &bytes); + if (ret == -EAGAIN) + goto again; + BUG_ON(ret); +out_lock: + spin_unlock(&block_group->tree_lock); out: return ret; } @@ -361,10 +799,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; - printk(KERN_ERR "entry offset %llu, bytes %llu\n", + printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", (unsigned long long)info->offset, - (unsigned long long)info->bytes); + (unsigned long long)info->bytes, + (info->bitmap) ? "yes" : "no"); } + printk(KERN_INFO "block group has cluster?: %s\n", + list_empty(&block_group->cluster_list) ? "no" : "yes"); printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); } @@ -397,26 +838,35 @@ __btrfs_return_cluster_to_free_space( { struct btrfs_free_space *entry; struct rb_node *node; + bool bitmap; spin_lock(&cluster->lock); if (cluster->block_group != block_group) goto out; + bitmap = cluster->points_to_bitmap; + cluster->block_group = NULL; cluster->window_start = 0; + list_del_init(&cluster->block_group_list); + cluster->points_to_bitmap = false; + + if (bitmap) + goto out; + node = rb_first(&cluster->root); - while(node) { + while (node) { entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); - link_free_space(block_group, entry); + BUG_ON(entry->bitmap); + tree_insert_offset(&block_group->free_space_offset, + entry->offset, &entry->offset_index, 0); } - list_del_init(&cluster->block_group_list); - - btrfs_put_block_group(cluster->block_group); - cluster->block_group = NULL; cluster->root.rb_node = NULL; + out: spin_unlock(&cluster->lock); + btrfs_put_block_group(block_group); return 0; } @@ -425,20 +875,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) struct btrfs_free_space *info; struct rb_node *node; struct btrfs_free_cluster *cluster; - struct btrfs_free_cluster *safe; + struct list_head *head; spin_lock(&block_group->tree_lock); - - list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, - block_group_list) { + while ((head = block_group->cluster_list.next) != + &block_group->cluster_list) { + cluster = list_entry(head, struct btrfs_free_cluster, + block_group_list); WARN_ON(cluster->block_group != block_group); __btrfs_return_cluster_to_free_space(block_group, cluster); + if (need_resched()) { + spin_unlock(&block_group->tree_lock); + cond_resched(); + spin_lock(&block_group->tree_lock); + } } - while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { - info = rb_entry(node, struct btrfs_free_space, bytes_index); + while ((node = rb_last(&block_group->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); unlink_free_space(block_group, info); + if (info->bitmap) + kfree(info->bitmap); kfree(info); if (need_resched()) { spin_unlock(&block_group->tree_lock); @@ -446,6 +904,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) spin_lock(&block_group->tree_lock); } } + spin_unlock(&block_group->tree_lock); } @@ -453,25 +912,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space *entry = NULL; + u64 bytes_search = bytes + empty_size; u64 ret = 0; spin_lock(&block_group->tree_lock); - entry = tree_search_offset(&block_group->free_space_offset, offset, - bytes + empty_size, 1); + entry = find_free_space(block_group, &offset, &bytes_search, 0); if (!entry) - entry = tree_search_bytes(&block_group->free_space_bytes, - offset, bytes + empty_size); - if (entry) { + goto out; + + ret = offset; + if (entry->bitmap) { + bitmap_clear_bits(block_group, entry, offset, bytes); + if (!entry->bytes) { + unlink_free_space(block_group, entry); + kfree(entry->bitmap); + kfree(entry); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + } else { unlink_free_space(block_group, entry); - ret = entry->offset; entry->offset += bytes; entry->bytes -= bytes; - if (!entry->bytes) kfree(entry); else link_free_space(block_group, entry); } + +out: spin_unlock(&block_group->tree_lock); return ret; @@ -517,6 +986,54 @@ int btrfs_return_cluster_to_free_space( return ret; } +static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 bytes, u64 min_start) +{ + struct btrfs_free_space *entry; + int err; + u64 search_start = cluster->window_start; + u64 search_bytes = bytes; + u64 ret = 0; + + spin_lock(&block_group->tree_lock); + spin_lock(&cluster->lock); + + if (!cluster->points_to_bitmap) + goto out; + + if (cluster->block_group != block_group) + goto out; + + /* + * search_start is the beginning of the bitmap, but at some point it may + * be a good idea to point to the actual start of the free area in the + * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only + * to 1 to make sure we get the bitmap entry + */ + entry = tree_search_offset(block_group, + offset_to_bitmap(block_group, search_start), + 1, 0); + if (!entry || !entry->bitmap) + goto out; + + search_start = min_start; + search_bytes = bytes; + + err = search_bitmap(block_group, entry, &search_start, + &search_bytes); + if (err) + goto out; + + ret = search_start; + bitmap_clear_bits(block_group, entry, ret, bytes); +out: + spin_unlock(&cluster->lock); + spin_unlock(&block_group->tree_lock); + + return ret; +} + /* * given a cluster, try to allocate 'bytes' from it, returns 0 * if it couldn't find anything suitably large, or a logical disk offset @@ -530,6 +1047,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, struct rb_node *node; u64 ret = 0; + if (cluster->points_to_bitmap) + return btrfs_alloc_from_bitmap(block_group, cluster, bytes, + min_start); + spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; @@ -567,9 +1088,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, } out: spin_unlock(&cluster->lock); + return ret; } +static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *entry, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 min_bytes) +{ + unsigned long next_zero; + unsigned long i; + unsigned long search_bits; + unsigned long total_bits; + unsigned long found_bits; + unsigned long start = 0; + unsigned long total_found = 0; + bool found = false; + + i = offset_to_bit(entry->offset, block_group->sectorsize, + max_t(u64, offset, entry->offset)); + search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + total_bits = bytes_to_bits(bytes, block_group->sectorsize); + +again: + found_bits = 0; + for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(entry->bitmap, + BITS_PER_BITMAP, i); + if (next_zero - i >= search_bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (!found_bits) + return -1; + + if (!found) { + start = i; + found = true; + } + + total_found += found_bits; + + if (cluster->max_size < found_bits * block_group->sectorsize) + cluster->max_size = found_bits * block_group->sectorsize; + + if (total_found < total_bits) { + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); + if (i - start > total_bits * 2) { + total_found = 0; + cluster->max_size = 0; + found = false; + } + goto again; + } + + cluster->window_start = start * block_group->sectorsize + + entry->offset; + cluster->points_to_bitmap = true; + + return 0; +} + /* * here we try to find a cluster of blocks in a block group. The goal * is to find at least bytes free and up to empty_size + bytes free. @@ -587,12 +1172,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, struct btrfs_free_space *entry = NULL; struct rb_node *node; struct btrfs_free_space *next; - struct btrfs_free_space *last; + struct btrfs_free_space *last = NULL; u64 min_bytes; u64 window_start; u64 window_free; u64 max_extent = 0; - int total_retries = 0; + bool found_bitmap = false; int ret; /* for metadata, allow allocates with more holes */ @@ -620,31 +1205,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } again: - min_bytes = min(min_bytes, bytes + empty_size); - entry = tree_search_bytes(&block_group->free_space_bytes, - offset, min_bytes); + entry = tree_search_offset(block_group, offset, found_bitmap, 1); if (!entry) { ret = -ENOSPC; goto out; } + + /* + * If found_bitmap is true, we exhausted our search for extent entries, + * and we just want to search all of the bitmaps that we can find, and + * ignore any extent entries we find. + */ + while (entry->bitmap || found_bitmap || + (!entry->bitmap && entry->bytes < min_bytes)) { + struct rb_node *node = rb_next(&entry->offset_index); + + if (entry->bitmap && entry->bytes > bytes + empty_size) { + ret = btrfs_bitmap_cluster(block_group, entry, cluster, + offset, bytes + empty_size, + min_bytes); + if (!ret) + goto got_it; + } + + if (!node) { + ret = -ENOSPC; + goto out; + } + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + + /* + * We already searched all the extent entries from the passed in offset + * to the end and didn't find enough space for the cluster, and we also + * didn't find any bitmaps that met our criteria, just go ahead and exit + */ + if (found_bitmap) { + ret = -ENOSPC; + goto out; + } + + cluster->points_to_bitmap = false; window_start = entry->offset; window_free = entry->bytes; last = entry; max_extent = entry->bytes; - while(1) { + while (1) { /* out window is just right, lets fill it */ if (window_free >= bytes + empty_size) break; node = rb_next(&last->offset_index); if (!node) { + if (found_bitmap) + goto again; ret = -ENOSPC; goto out; } next = rb_entry(node, struct btrfs_free_space, offset_index); /* + * we found a bitmap, so if this search doesn't result in a + * cluster, we know to go and search again for the bitmaps and + * start looking for space there + */ + if (next->bitmap) { + if (!found_bitmap) + offset = next->offset; + found_bitmap = true; + last = next; + continue; + } + + /* * we haven't filled the empty size and the window is * very large. reset and try again */ @@ -655,19 +1289,6 @@ again: window_free = entry->bytes; last = entry; max_extent = 0; - total_retries++; - if (total_retries % 64 == 0) { - if (min_bytes >= (bytes + empty_size)) { - ret = -ENOSPC; - goto out; - } - /* - * grow our allocation a bit, we're not having - * much luck - */ - min_bytes *= 2; - goto again; - } } else { last = next; window_free += next->bytes; @@ -685,11 +1306,19 @@ again: * The cluster includes an rbtree, but only uses the offset index * of each free space cache entry. */ - while(1) { + while (1) { node = rb_next(&entry->offset_index); - unlink_free_space(block_group, entry); + if (entry->bitmap && node) { + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } else if (entry->bitmap && !node) { + break; + } + + rb_erase(&entry->offset_index, &block_group->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index); + &entry->offset_index, 0); BUG_ON(ret); if (!node || entry == last) @@ -697,8 +1326,10 @@ again: entry = rb_entry(node, struct btrfs_free_space, offset_index); } - ret = 0; + cluster->max_size = max_extent; +got_it: + ret = 0; atomic_inc(&block_group->count); list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; @@ -718,6 +1349,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) spin_lock_init(&cluster->refill_lock); cluster->root.rb_node = NULL; cluster->max_size = 0; + cluster->points_to_bitmap = false; INIT_LIST_HEAD(&cluster->block_group_list); cluster->block_group = NULL; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 266fb8764054..890a8e79011b 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -19,6 +19,14 @@ #ifndef __BTRFS_FREE_SPACE_CACHE #define __BTRFS_FREE_SPACE_CACHE +struct btrfs_free_space { + struct rb_node offset_index; + u64 offset; + u64 bytes; + unsigned long *bitmap; + struct list_head list; +}; + int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7ffa3d34ea19..59cba180fe83 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -26,7 +26,6 @@ #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> -#include <linux/smp_lock.h> #include <linux/backing-dev.h> #include <linux/mpage.h> #include <linux/swap.h> @@ -2604,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (root->ref_cows) btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); path = btrfs_alloc_path(); - path->reada = -1; BUG_ON(!path); + path->reada = -1; /* FIXME, add redo link to tree so we don't leak on crash */ key.objectid = inode->i_ino; @@ -3100,8 +3099,12 @@ static void inode_tree_add(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_inode *entry; - struct rb_node **p = &root->inode_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p; + struct rb_node *parent; + +again: + p = &root->inode_tree.rb_node; + parent = NULL; spin_lock(&root->inode_lock); while (*p) { @@ -3109,13 +3112,16 @@ static void inode_tree_add(struct inode *inode) entry = rb_entry(parent, struct btrfs_inode, rb_node); if (inode->i_ino < entry->vfs_inode.i_ino) - p = &(*p)->rb_left; + p = &parent->rb_left; else if (inode->i_ino > entry->vfs_inode.i_ino) - p = &(*p)->rb_right; + p = &parent->rb_right; else { WARN_ON(!(entry->vfs_inode.i_state & (I_WILL_FREE | I_FREEING | I_CLEAR))); - break; + rb_erase(parent, &root->inode_tree); + RB_CLEAR_NODE(parent); + spin_unlock(&root->inode_lock); + goto again; } } rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); @@ -3127,12 +3133,12 @@ static void inode_tree_del(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + spin_lock(&root->inode_lock); if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - spin_lock(&root->inode_lock); rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - spin_unlock(&root->inode_lock); RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); } + spin_unlock(&root->inode_lock); } static noinline void init_btrfs_i(struct inode *inode) @@ -4786,8 +4792,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * and the replacement file is large. Start IO on it now so * we don't add too much work to the end of the transaction */ - if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && - new_inode->i_size && + if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) filemap_flush(old_inode->i_mapping); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9f4db848db10..bd88f25889f7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -27,7 +27,6 @@ #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> -#include <linux/smp_lock.h> #include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/mpage.h> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index d6f0806c682f..7b2f401e604e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -740,7 +740,6 @@ int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, .nr_to_write = mapping->nrpages * 2, .range_start = start, .range_end = end, - .for_writepages = 1, }; return btrfs_writepages(mapping, &wbc); } diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 6d6523da0a30..0d126be22b63 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) } printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", (unsigned long long)btrfs_header_bytenr(c), - btrfs_header_level(c), nr, + level, nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); @@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) btrfs_level_size(root, level - 1), btrfs_node_ptr_generation(c, i)); if (btrfs_is_leaf(next) && - btrfs_header_level(c) != 1) + level != 1) BUG(); if (btrfs_header_level(next) != - btrfs_header_level(c) - 1) + level - 1) BUG(); btrfs_print_tree(root, next); free_extent_buffer(next); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 008397934778..c04f7f212602 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -670,6 +670,8 @@ again: err = ret; goto out; } + if (ret > 0 && path2->slots[level] > 0) + path2->slots[level]--; eb = path2->nodes[level]; WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != @@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(level == 0); path->lowest_level = level; ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); + path->lowest_level = 0; if (ret < 0) { btrfs_free_path(path); return ret; @@ -2550,8 +2553,13 @@ int relocate_inode_pages(struct inode *inode, u64 start, u64 len) last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; /* make sure the dirty trick played by the caller work */ - ret = invalidate_inode_pages2_range(inode->i_mapping, - first_index, last_index); + while (1) { + ret = invalidate_inode_pages2_range(inode->i_mapping, + first_index, last_index); + if (ret != -EBUSY) + break; + schedule_timeout(HZ/10); + } if (ret) goto out_unlock; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9f179d4832d5..6d6d06cb6dfc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -26,7 +26,6 @@ #include <linux/init.h> #include <linux/seq_file.h> #include <linux/string.h> -#include <linux/smp_lock.h> #include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/mpage.h> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2dbf1c1f56ee..cdbb5022da52 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -40,6 +40,12 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) } } +static noinline void switch_commit_root(struct btrfs_root *root) +{ + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -444,9 +450,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, btrfs_write_dirty_block_groups(trans, root); - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); - while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start) @@ -457,13 +460,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, &root->root_key, &root->root_item); BUG_ON(ret); - btrfs_write_dirty_block_groups(trans, root); - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + ret = btrfs_write_dirty_block_groups(trans, root); BUG_ON(ret); } - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + + if (root != root->fs_info->extent_root) + switch_commit_root(root); + return 0; } @@ -495,10 +499,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, root = list_entry(next, struct btrfs_root, dirty_list); update_cowonly_root(trans, root); - - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); } + + down_write(&fs_info->extent_commit_sem); + switch_commit_root(fs_info->extent_root); + up_write(&fs_info->extent_commit_sem); + return 0; } @@ -544,8 +550,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_update_reloc_root(trans, root); if (root->commit_root != root->node) { - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + switch_commit_root(root); btrfs_set_root_node(&root->root_item, root->node); } @@ -852,6 +857,16 @@ static void update_super_roots(struct btrfs_root *root) super->root_level = root_item->level; } +int btrfs_transaction_in_commit(struct btrfs_fs_info *info) +{ + int ret = 0; + spin_lock(&info->new_trans_lock); + if (info->running_transaction) + ret = info->running_transaction->in_commit; + spin_unlock(&info->new_trans_lock); + return ret; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -943,9 +958,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); - if (flush_on_commit || snap_pending) { - if (flush_on_commit) - btrfs_start_delalloc_inodes(root); + if (flush_on_commit) { + btrfs_start_delalloc_inodes(root); + ret = btrfs_wait_ordered_extents(root, 0); + BUG_ON(ret); + } else if (snap_pending) { ret = btrfs_wait_ordered_extents(root, 1); BUG_ON(ret); } @@ -1009,15 +1026,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_set_root_node(&root->fs_info->tree_root->root_item, root->fs_info->tree_root->node); - free_extent_buffer(root->fs_info->tree_root->commit_root); - root->fs_info->tree_root->commit_root = - btrfs_root_node(root->fs_info->tree_root); + switch_commit_root(root->fs_info->tree_root); btrfs_set_root_node(&root->fs_info->chunk_root->root_item, root->fs_info->chunk_root->node); - free_extent_buffer(root->fs_info->chunk_root->commit_root); - root->fs_info->chunk_root->commit_root = - btrfs_root_node(root->fs_info->chunk_root); + switch_commit_root(root->fs_info->chunk_root); update_super_roots(root); @@ -1057,6 +1070,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, cur_trans->commit_done = 1; root->fs_info->last_trans_committed = cur_trans->transid; + wake_up(&cur_trans->commit_wait); put_transaction(cur_trans); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 961c3ee5a2e1..663c67404918 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -107,4 +107,5 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); +int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c13922206d1b..d91b0de7c502 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, return -ENOENT; inode = read_one_inode(root, key->objectid); - BUG_ON(!dir); + BUG_ON(!inode); ref_ptr = btrfs_item_ptr_offset(eb, slot); ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3ab80e9cd767..5cf405b0828d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -260,7 +260,7 @@ loop_lock: num_run++; batch_run++; - if (bio_sync(cur)) + if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) num_sync_run++; if (need_resched()) { @@ -721,7 +721,8 @@ error: */ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, - u64 num_bytes, u64 *start) + u64 num_bytes, u64 *start, + u64 *max_avail) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; @@ -758,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 0); if (ret < 0) goto error; - ret = btrfs_previous_item(root, path, 0, key.type); - if (ret < 0) - goto error; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto error; + if (ret > 0) + start_found = 1; + } l = path->nodes[0]; btrfs_item_key_to_cpu(l, &key, path->slots[0]); while (1) { @@ -803,6 +808,10 @@ no_more_items: if (last_byte < search_start) last_byte = search_start; hole_size = key.offset - last_byte; + + if (hole_size > *max_avail) + *max_avail = hole_size; + if (key.offset > last_byte && hole_size >= num_bytes) { *start = last_byte; @@ -1621,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, device->fs_devices->total_rw_bytes += diff; device->total_bytes = new_size; + device->disk_total_bytes = new_size; btrfs_clear_space_info_full(device->dev_root->fs_info); return btrfs_update_device(trans, device); @@ -2007,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) goto done; if (ret) { ret = 0; - goto done; + break; } l = path->nodes[0]; @@ -2015,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) - goto done; + break; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); length = btrfs_dev_extent_length(l, dev_extent); @@ -2171,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size); again: + max_avail = 0; if (!map || map->num_stripes != num_stripes) { kfree(map); map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -2219,7 +2230,8 @@ again: if (device->in_fs_metadata && avail >= min_free) { ret = find_free_dev_extent(trans, device, - min_free, &dev_offset); + min_free, &dev_offset, + &max_avail); if (ret == 0) { list_move_tail(&device->dev_alloc_list, &private_devs); @@ -2795,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, } } - for (i = 0; i > nr; i++) { - struct btrfs_multi_bio *multi; - struct btrfs_bio_stripe *stripe; - int ret; - - length = 1; - ret = btrfs_map_block(map_tree, WRITE, buf[i], - &length, &multi, 0); - BUG_ON(ret); - - stripe = multi->stripes; - for (j = 0; j < multi->num_stripes; j++) { - if (stripe->physical >= physical && - physical < stripe->physical + length) - break; - } - BUG_ON(j >= multi->num_stripes); - kfree(multi); - } - *logical = buf; *naddrs = nr; *stripe_len = map->stripe_len; @@ -2911,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); - if (bio_sync(bio)) + if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) pending_bios = &device->pending_sync_bios; else pending_bios = &device->pending_bios; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ecfbce836d32..3e2b90eaa239 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, *total_in = 0; workspace = find_zlib_workspace(); - if (!workspace) + if (IS_ERR(workspace)) return -1; if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { @@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, char *kaddr; workspace = find_zlib_workspace(); - if (!workspace) + if (IS_ERR(workspace)) return -ENOMEM; data_in = kmap(pages_in[page_in_index]); @@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in, return -ENOMEM; workspace = find_zlib_workspace(); - if (!workspace) + if (IS_ERR(workspace)) return -ENOMEM; workspace->inf_strm.next_in = data_in; diff --git a/fs/buffer.c b/fs/buffer.c index a3ef091a45bd..90a98865b0cc 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -281,7 +281,7 @@ static void free_more_memory(void) struct zone *zone; int nid; - wakeup_pdflush(1024); + wakeup_flusher_threads(1024); yield(); for_each_online_node(nid) { @@ -1165,8 +1165,11 @@ void mark_buffer_dirty(struct buffer_head *bh) if (!test_set_buffer_dirty(bh)) { struct page *page = bh->b_page; - if (!TestSetPageDirty(page)) - __set_page_dirty(page, page_mapping(page), 0); + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); + if (mapping) + __set_page_dirty(page, mapping, 0); + } } } diff --git a/fs/char_dev.c b/fs/char_dev.c index b7c9d5187a75..3cbc57f932d2 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -13,7 +13,6 @@ #include <linux/major.h> #include <linux/errno.h> #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/seq_file.h> #include <linux/kobject.h> @@ -32,6 +31,7 @@ * - no readahead or I/O queue unplugging required */ struct backing_dev_info directly_mappable_cdev_bdi = { + .name = "char", .capabilities = ( #ifdef CONFIG_MMU /* permit private copies of the data to be taken */ @@ -238,8 +238,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, } /** - * register_chrdev() - Register a major number for character devices. + * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation + * @baseminor: first of the requested range of minor numbers + * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with this devices * @@ -255,19 +257,17 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, * /dev. It only helps to keep track of the different owners of devices. If * your module name has only one type of devices it's ok to use e.g. the name * of the module here. - * - * This function registers a range of 256 minor numbers. The first minor number - * is 0. */ -int register_chrdev(unsigned int major, const char *name, - const struct file_operations *fops) +int __register_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name, + const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; char *s; int err = -ENOMEM; - cd = __register_chrdev_region(major, 0, 256, name); + cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); @@ -281,7 +281,7 @@ int register_chrdev(unsigned int major, const char *name, for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/')) *s = '!'; - err = cdev_add(cdev, MKDEV(cd->major, 0), 256); + err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; @@ -291,7 +291,7 @@ int register_chrdev(unsigned int major, const char *name, out: kobject_put(&cdev->kobj); out2: - kfree(__unregister_chrdev_region(cd->major, 0, 256)); + kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } @@ -317,10 +317,23 @@ void unregister_chrdev_region(dev_t from, unsigned count) } } -void unregister_chrdev(unsigned int major, const char *name) +/** + * __unregister_chrdev - unregister and destroy a cdev + * @major: major device number + * @baseminor: first of the range of minor numbers + * @count: the number of minor numbers this cdev is occupying + * @name: name of this range of devices + * + * Unregister and destroy the cdev occupying the region described by + * @major, @baseminor and @count. This function undoes what + * __register_chrdev() did. + */ +void __unregister_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name) { struct char_device_struct *cd; - cd = __unregister_chrdev_region(major, 0, 256); + + cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); @@ -569,6 +582,6 @@ EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_index); -EXPORT_SYMBOL(register_chrdev); -EXPORT_SYMBOL(unregister_chrdev); +EXPORT_SYMBOL(__register_chrdev); +EXPORT_SYMBOL(__unregister_chrdev); EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 3a9b7a58a51d..145540a316ab 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,13 @@ +Version 1.60 +------------- +Fix memory leak in reconnect. Fix oops in DFS mount error path. +Set s_maxbytes to smaller (the max that vfs can handle) so that +sendfile will now work over cifs mounts again. Add noforcegid +and noforceuid mount parameters. Fix small mem leak when using +ntlmv2. Fix 2nd mount to same server but with different port to +be allowed (rather than reusing the 1st port) - only when the +user explicitly overrides the port on the 2nd mount. + Version 1.59 ------------ Client uses server inode numbers (which are persistent) rather than @@ -5,7 +15,11 @@ client generated ones by default (mount option "serverino" turned on by default if server supports it). Add forceuid and forcegid mount options (so that when negotiating unix extensions specifying which uid mounted does not immediately force the server's reported -uids to be overridden). Add support for scope moutn parm. +uids to be overridden). Add support for scope mount parm. Improve +hard link detection to use same inode for both. Do not set +read-only dos attribute on directories (for chmod) since Windows +explorer special cases this attribute bit for directories for +a different purpose. Version 1.58 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index ad92921dbde4..79c1a93400be 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -262,11 +262,11 @@ A partial list of the supported mount options follows: mount. domain Set the SMB/CIFS workgroup name prepended to the username during CIFS session establishment - forceuid Set the default uid for inodes based on the uid - passed in. For mounts to servers + forceuid Set the default uid for inodes to the uid + passed in on mount. For mounts to servers which do support the CIFS Unix extensions, such as a properly configured Samba server, the server provides - the uid, gid and mode so this parameter should not be + the uid, gid and mode so this parameter should not be specified unless the server and clients uid and gid numbering differ. If the server and client are in the same domain (e.g. running winbind or nss_ldap) and @@ -278,11 +278,7 @@ A partial list of the supported mount options follows: of existing files will be the uid (gid) of the person who executed the mount (root, except when mount.cifs is configured setuid for user mounts) unless the "uid=" - (gid) mount option is specified. For the uid (gid) of newly - created files and directories, ie files created since - the last mount of the server share, the expected uid - (gid) is cached as long as the inode remains in - memory on the client. Also note that permission + (gid) mount option is specified. Also note that permission checks (authorization checks) on accesses to a file occur at the server, but there are cases in which an administrator may want to restrict at the client as well. For those @@ -290,12 +286,15 @@ A partial list of the supported mount options follows: (such as Windows), permissions can also be checked at the client, and a crude form of client side permission checking can be enabled by specifying file_mode and dir_mode on - the client. Note that the mount.cifs helper must be - at version 1.10 or higher to support specifying the uid - (or gid) in non-numeric form. - forcegid (similar to above but for the groupid instead of uid) + the client. (default) + forcegid (similar to above but for the groupid instead of uid) (default) + noforceuid Fill in file owner information (uid) by requesting it from + the server if possible. With this option, the value given in + the uid= option (on mount) will only be used if the server + can not support returning uids on inodes. + noforcegid (similar to above but for the group owner, gid, instead of uid) uid Set the default uid for inodes, and indicate to the - cifs kernel driver which local user mounted . If the server + cifs kernel driver which local user mounted. If the server supports the unix extensions the default uid is not used to fill in the owner fields of inodes (files) unless the "forceuid" parameter is specified. diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 7f19fefd3d45..42cec2a7c0cf 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -261,6 +261,8 @@ static ssize_t cifs_stats_proc_write(struct file *file, atomic_set(&tcon->num_reads, 0); atomic_set(&tcon->num_oplock_brks, 0); atomic_set(&tcon->num_opens, 0); + atomic_set(&tcon->num_posixopens, 0); + atomic_set(&tcon->num_posixmkdirs, 0); atomic_set(&tcon->num_closes, 0); atomic_set(&tcon->num_deletes, 0); atomic_set(&tcon->num_mkdirs, 0); @@ -347,11 +349,15 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) atomic_read(&tcon->num_locks), atomic_read(&tcon->num_hardlinks), atomic_read(&tcon->num_symlinks)); - seq_printf(m, "\nOpens: %d Closes: %d" + seq_printf(m, "\nOpens: %d Closes: %d " "Deletes: %d", atomic_read(&tcon->num_opens), atomic_read(&tcon->num_closes), atomic_read(&tcon->num_deletes)); + seq_printf(m, "\nPosix Opens: %d " + "Posix Mkdirs: %d", + atomic_read(&tcon->num_posixopens), + atomic_read(&tcon->num_posixmkdirs)); seq_printf(m, "\nMkdirs: %d Rmdirs: %d", atomic_read(&tcon->num_mkdirs), atomic_read(&tcon->num_rmdirs)); diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 3bb11be8b6a8..606912d8f2a8 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -55,7 +55,7 @@ void cifs_dfs_release_automount_timer(void) * i.e. strips from UNC trailing path that is not part of share * name and fixup missing '\' in the begining of DFS node refferal * if neccessary. - * Returns pointer to share name on success or NULL on error. + * Returns pointer to share name on success or ERR_PTR on error. * Caller is responsible for freeing returned string. */ static char *cifs_get_share_name(const char *node_name) @@ -68,7 +68,7 @@ static char *cifs_get_share_name(const char *node_name) UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, GFP_KERNEL); if (!UNC) - return NULL; + return ERR_PTR(-ENOMEM); /* get share name and server name */ if (node_name[1] != '\\') { @@ -87,7 +87,7 @@ static char *cifs_get_share_name(const char *node_name) cERROR(1, ("%s: no server name end in node name: %s", __func__, node_name)); kfree(UNC); - return NULL; + return ERR_PTR(-EINVAL); } /* find sharename end */ @@ -133,6 +133,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata, return ERR_PTR(-EINVAL); *devname = cifs_get_share_name(ref->node_name); + if (IS_ERR(*devname)) { + rc = PTR_ERR(*devname); + *devname = NULL; + goto compose_mount_options_err; + } + rc = dns_resolve_server_name_to_ip(*devname, &srvIP); if (rc != 0) { cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 4a4581cb2b5e..8ec7736ce954 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -86,6 +86,9 @@ struct key_type cifs_spnego_key_type = { /* strlen of ";user=" */ #define USER_KEY_LEN 6 +/* strlen of ";pid=0x" */ +#define PID_KEY_LEN 7 + /* get a key struct with a SPNEGO security blob, suitable for session setup */ struct key * cifs_get_spnego_key(struct cifsSesInfo *sesInfo) @@ -103,7 +106,8 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) IP_KEY_LEN + INET6_ADDRSTRLEN + MAX_MECH_STR_LEN + UID_KEY_LEN + (sizeof(uid_t) * 2) + - USER_KEY_LEN + strlen(sesInfo->userName) + 1; + USER_KEY_LEN + strlen(sesInfo->userName) + + PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; spnego_key = ERR_PTR(-ENOMEM); description = kzalloc(desc_len, GFP_KERNEL); @@ -121,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) if (server->addr.sockAddr.sin_family == AF_INET) sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); else if (server->addr.sockAddr.sin_family == AF_INET6) - sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr); + sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr); else goto out; @@ -141,6 +145,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) dp = description + strlen(description); sprintf(dp, ";user=%s", sesInfo->userName); + dp = description + strlen(description); + sprintf(dp, ";pid=0x%x", current->pid); + cFYI(1, ("key description = %s", description)); spnego_key = request_key(&cifs_spnego_key_type, description, ""); diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 60e3c4253de0..714a542cbafc 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -44,7 +44,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes, int maxwords = maxbytes / 2; char tmp[NLS_MAX_CHARSET_SIZE]; - for (i = 0; from[i] && i < maxwords; i++) { + for (i = 0; i < maxwords && from[i]; i++) { charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, NLS_MAX_CHARSET_SIZE); if (charlen > 0) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 1403b5d86a73..7dfe0842a6f6 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -327,7 +327,7 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl) static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, - struct inode *inode) + struct cifs_fattr *fattr) { int i; int num_aces = 0; @@ -340,7 +340,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, if (!pdacl) { /* no DACL in the security descriptor, set all the permissions for user/group/other */ - inode->i_mode |= S_IRWXUGO; + fattr->cf_mode |= S_IRWXUGO; return; } @@ -357,7 +357,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, /* reset rwx permissions for user/group/other. Also, if num_aces is 0 i.e. DACL has no ACEs, user/group/other have no permissions */ - inode->i_mode &= ~(S_IRWXUGO); + fattr->cf_mode &= ~(S_IRWXUGO); acl_base = (char *)pdacl; acl_size = sizeof(struct cifs_acl); @@ -379,17 +379,17 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, if (compare_sids(&(ppace[i]->sid), pownersid)) access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, - &(inode->i_mode), + &fattr->cf_mode, &user_mask); if (compare_sids(&(ppace[i]->sid), pgrpsid)) access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, - &(inode->i_mode), + &fattr->cf_mode, &group_mask); if (compare_sids(&(ppace[i]->sid), &sid_everyone)) access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, - &(inode->i_mode), + &fattr->cf_mode, &other_mask); /* memcpy((void *)(&(cifscred->aces[i])), @@ -464,7 +464,7 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl) /* Convert CIFS ACL to POSIX form */ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, - struct inode *inode) + struct cifs_fattr *fattr) { int rc; struct cifs_sid *owner_sid_ptr, *group_sid_ptr; @@ -472,7 +472,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, char *end_of_acl = ((char *)pntsd) + acl_len; __u32 dacloffset; - if ((inode == NULL) || (pntsd == NULL)) + if (pntsd == NULL) return -EIO; owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + @@ -497,7 +497,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, if (dacloffset) parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, - group_sid_ptr, inode); + group_sid_ptr, fattr); else cFYI(1, ("no ACL")); /* BB grant all or default perms? */ @@ -508,7 +508,6 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr, sizeof(struct cifs_sid)); */ - return 0; } @@ -608,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, return get_cifs_acl_by_path(cifs_sb, path, pacllen); pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); return pntsd; } @@ -666,13 +665,14 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); return rc; } /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ -void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode, - const char *path, const __u16 *pfid) +void +cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, + struct inode *inode, const char *path, const __u16 *pfid) { struct cifs_ntsd *pntsd = NULL; u32 acllen = 0; @@ -687,7 +687,7 @@ void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode, /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ if (pntsd) - rc = parse_sec_desc(pntsd, acllen, inode); + rc = parse_sec_desc(pntsd, acllen, fattr); if (rc) cFYI(1, ("parse sec desc failed rc = %d", rc)); diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 7c9809523f42..7efe1745494d 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -373,6 +373,7 @@ calc_exit_2: compare with the NTLM example */ hmac_md5_final(ses->server->ntlmv2_hash, pctxt); + kfree(pctxt); return rc; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 9f669f982c4d..3610e9958b4c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -308,7 +308,6 @@ cifs_alloc_inode(struct super_block *sb) if (!cifs_inode) return NULL; cifs_inode->cifsAttrs = 0x20; /* default */ - atomic_set(&cifs_inode->inUse, 0); cifs_inode->time = 0; cifs_inode->write_behind_rc = 0; /* Until the file is open and we have gotten oplock @@ -362,13 +361,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) static int cifs_show_options(struct seq_file *s, struct vfsmount *m) { - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *tcon; - - cifs_sb = CIFS_SB(m->mnt_sb); - tcon = cifs_sb->tcon; + struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); + struct cifsTconInfo *tcon = cifs_sb->tcon; - seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName); + seq_printf(s, ",unc=%s", tcon->treeName); if (tcon->ses->userName) seq_printf(s, ",username=%s", tcon->ses->userName); if (tcon->ses->domainName) @@ -377,10 +373,14 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) seq_printf(s, ",forceuid"); + else + seq_printf(s, ",noforceuid"); seq_printf(s, ",gid=%d", cifs_sb->mnt_gid); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) seq_printf(s, ",forcegid"); + else + seq_printf(s, ",noforcegid"); cifs_show_address(s, tcon->ses->server); @@ -986,19 +986,19 @@ static int cifs_oplock_thread(void *dummyarg) if (try_to_freeze()) continue; - spin_lock(&GlobalMid_Lock); - if (list_empty(&GlobalOplock_Q)) { - spin_unlock(&GlobalMid_Lock); + spin_lock(&cifs_oplock_lock); + if (list_empty(&cifs_oplock_list)) { + spin_unlock(&cifs_oplock_lock); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(39*HZ); } else { - oplock_item = list_entry(GlobalOplock_Q.next, + oplock_item = list_entry(cifs_oplock_list.next, struct oplock_q_entry, qhead); cFYI(1, ("found oplock item to write out")); pTcon = oplock_item->tcon; inode = oplock_item->pinode; netfid = oplock_item->netfid; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_oplock_lock); DeleteOplockQEntry(oplock_item); /* can not grab inode sem here since it would deadlock when oplock received on delete @@ -1055,7 +1055,7 @@ init_cifs(void) int rc = 0; cifs_proc_init(); INIT_LIST_HEAD(&cifs_tcp_ses_list); - INIT_LIST_HEAD(&GlobalOplock_Q); + INIT_LIST_HEAD(&cifs_oplock_list); #ifdef CONFIG_CIFS_EXPERIMENTAL INIT_LIST_HEAD(&GlobalDnotifyReqList); INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); @@ -1084,6 +1084,7 @@ init_cifs(void) rwlock_init(&GlobalSMBSeslock); rwlock_init(&cifs_tcp_ses_lock); spin_lock_init(&GlobalMid_Lock); + spin_lock_init(&cifs_oplock_lock); if (cifs_max_pending < 2) { cifs_max_pending = 2; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 9570a0e8023f..094325e3f714 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -24,6 +24,19 @@ #define ROOT_I 2 +/* + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. + */ +static inline ino_t +cifs_uniqueid_to_ino_t(u64 fileid) +{ + ino_t ino = (ino_t) fileid; + if (sizeof(ino_t) < sizeof(u64)) + ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8; + return ino; +} + extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; @@ -100,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.59" +#define CIFS_VERSION "1.61" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index e1225e6ded2f..6cfc81a32703 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -260,6 +260,8 @@ struct cifsTconInfo { atomic_t num_closes; atomic_t num_deletes; atomic_t num_mkdirs; + atomic_t num_posixopens; + atomic_t num_posixmkdirs; atomic_t num_rmdirs; atomic_t num_renames; atomic_t num_t2renames; @@ -349,11 +351,24 @@ struct cifsFileInfo { bool closePend:1; /* file is marked to close */ bool invalidHandle:1; /* file closed via session abend */ bool messageMode:1; /* for pipes: message vs byte mode */ - atomic_t wrtPending; /* handle in use - defer close */ + atomic_t count; /* reference count */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; }; +/* Take a reference on the file private data */ +static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) +{ + atomic_inc(&cifs_file->count); +} + +/* Release a reference on the file private data */ +static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file) +{ + if (atomic_dec_and_test(&cifs_file->count)) + kfree(cifs_file); +} + /* * One of these for each file inode */ @@ -364,13 +379,13 @@ struct cifsInodeInfo { struct list_head openFileList; int write_behind_rc; __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ - atomic_t inUse; /* num concurrent users (local openers cifs) of file*/ unsigned long time; /* jiffies of last update/check of inode */ bool clientCanCacheRead:1; /* read oplock */ bool clientCanCacheAll:1; /* read and writebehind oplock */ bool oplockPending:1; bool delete_pending:1; /* DELETE_ON_CLOSE is set */ u64 server_eof; /* current file size on server */ + u64 uniqueid; /* server inode number */ struct inode vfs_inode; }; @@ -472,6 +487,32 @@ struct dfs_info3_param { char *node_name; }; +/* + * common struct for holding inode info when searching for or updating an + * inode with new info + */ + +#define CIFS_FATTR_DFS_REFERRAL 0x1 +#define CIFS_FATTR_DELETE_PENDING 0x2 +#define CIFS_FATTR_NEED_REVAL 0x4 + +struct cifs_fattr { + u32 cf_flags; + u32 cf_cifsattrs; + u64 cf_uniqueid; + u64 cf_eof; + u64 cf_bytes; + uid_t cf_uid; + gid_t cf_gid; + umode_t cf_mode; + dev_t cf_rdev; + unsigned int cf_nlink; + unsigned int cf_dtype; + struct timespec cf_atime; + struct timespec cf_mtime; + struct timespec cf_ctime; +}; + static inline void free_dfs_info_param(struct dfs_info3_param *param) { if (param) { @@ -628,7 +669,11 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock; */ GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; -GLOBAL_EXTERN struct list_head GlobalOplock_Q; +/* Global list of oplocks */ +GLOBAL_EXTERN struct list_head cifs_oplock_list; + +/* Protects the cifs_oplock_list */ +GLOBAL_EXTERN spinlock_t cifs_oplock_lock; /* Outstanding dir notify requests */ GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index a785f69dbc9f..2d07f890a842 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2328,19 +2328,7 @@ struct file_attrib_tag { typedef struct { __le32 NextEntryOffset; __u32 ResumeKey; /* as with FileIndex - no need to convert */ - __le64 EndOfFile; - __le64 NumOfBytes; - __le64 LastStatusChange; /*SNIA specs DCE time for the 3 time fields */ - __le64 LastAccessTime; - __le64 LastModificationTime; - __le64 Uid; - __le64 Gid; - __le32 Type; - __le64 DevMajor; - __le64 DevMinor; - __le64 UniqueId; - __le64 Permissions; - __le64 Nlinks; + FILE_UNIX_BASIC_INFO basic; char FileName[1]; } __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c419416a42ee..da8fbf565991 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -98,9 +98,13 @@ extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, extern int cifs_posix_open(char *full_path, struct inode **pinode, struct super_block *sb, int mode, int oflags, int *poplock, __u16 *pnetfid, int xid); -extern void posix_fill_in_inode(struct inode *tmp_inode, - FILE_UNIX_BASIC_INFO *pData, int isNewInode); -extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum); +extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, + FILE_UNIX_BASIC_INFO *info, + struct cifs_sb_info *cifs_sb); +extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); +extern struct inode *cifs_iget(struct super_block *sb, + struct cifs_fattr *fattr); + extern int cifs_get_inode_info(struct inode **pinode, const unsigned char *search_path, FILE_ALL_INFO *pfile_info, @@ -108,8 +112,9 @@ extern int cifs_get_inode_info(struct inode **pinode, extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, int xid); -extern void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode, - const char *path, const __u16 *pfid); +extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, struct inode *inode, + const char *path, const __u16 *pfid); extern int mode_to_acl(struct inode *inode, const char *path, __u64); extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, @@ -215,7 +220,11 @@ struct cifs_unix_set_info_args { dev_t device; }; -extern int CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *pTcon, +extern int CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon, + const struct cifs_unix_set_info_args *args, + u16 fid, u32 pid_of_opener); + +extern int CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *pTcon, char *fileName, const struct cifs_unix_set_info_args *args, const struct nls_table *nls_codepage, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 61007c627497..301e307e1279 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -100,110 +100,138 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon) to this tcon */ } -/* Allocate and return pointer to an SMB request buffer, and set basic - SMB information in the SMB header. If the return code is zero, this - function must have filled in request_buf pointer */ +/* reconnect the socket, tcon, and smb session if needed */ static int -small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, - void **request_buf) +cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command) { int rc = 0; + struct cifsSesInfo *ses; + struct TCP_Server_Info *server; + struct nls_table *nls_codepage; - /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so - check for tcp and smb session status done differently - for those three - in the calling routine */ - if (tcon) { - if (tcon->tidStatus == CifsExiting) { - /* only tree disconnect, open, and write, - (and ulogoff which does not have tcon) - are allowed as we start force umount */ - if ((smb_command != SMB_COM_WRITE_ANDX) && - (smb_command != SMB_COM_OPEN_ANDX) && - (smb_command != SMB_COM_TREE_DISCONNECT)) { - cFYI(1, ("can not send cmd %d while umounting", - smb_command)); - return -ENODEV; - } + /* + * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for + * tcp and smb session status done differently for those three - in the + * calling routine + */ + if (!tcon) + return 0; + + ses = tcon->ses; + server = ses->server; + + /* + * only tree disconnect, open, and write, (and ulogoff which does not + * have tcon) are allowed as we start force umount + */ + if (tcon->tidStatus == CifsExiting) { + if (smb_command != SMB_COM_WRITE_ANDX && + smb_command != SMB_COM_OPEN_ANDX && + smb_command != SMB_COM_TREE_DISCONNECT) { + cFYI(1, ("can not send cmd %d while umounting", + smb_command)); + return -ENODEV; } - if ((tcon->ses) && (tcon->ses->status != CifsExiting) && - (tcon->ses->server)) { - struct nls_table *nls_codepage; - /* Give Demultiplex thread up to 10 seconds to - reconnect, should be greater than cifs socket - timeout which is 7 seconds */ - while (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - wait_event_interruptible_timeout(tcon->ses->server->response_q, - (tcon->ses->server->tcpStatus == - CifsGood), 10 * HZ); - if (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - /* on "soft" mounts we wait once */ - if (!tcon->retry || - (tcon->ses->status == CifsExiting)) { - cFYI(1, ("gave up waiting on " - "reconnect in smb_init")); - return -EHOSTDOWN; - } /* else "hard" mount - keep retrying - until process is killed or server - comes back on-line */ - } else /* TCP session is reestablished now */ - break; - } + } - nls_codepage = load_nls_default(); - /* need to prevent multiple threads trying to - simultaneously reconnect the same SMB session */ - down(&tcon->ses->sesSem); - if (tcon->ses->need_reconnect) - rc = cifs_setup_session(0, tcon->ses, - nls_codepage); - if (!rc && (tcon->need_reconnect)) { - mark_open_files_invalid(tcon); - rc = CIFSTCon(0, tcon->ses, tcon->treeName, - tcon, nls_codepage); - up(&tcon->ses->sesSem); - /* BB FIXME add code to check if wsize needs - update due to negotiated smb buffer size - shrinking */ - if (rc == 0) { - atomic_inc(&tconInfoReconnectCount); - /* tell server Unix caps we support */ - if (tcon->ses->capabilities & CAP_UNIX) - reset_cifs_unix_caps( - 0 /* no xid */, - tcon, - NULL /* we do not know sb */, - NULL /* no vol info */); - } + if (ses->status == CifsExiting) + return -EIO; - cFYI(1, ("reconnect tcon rc = %d", rc)); - /* Removed call to reopen open files here. - It is safer (and faster) to reopen files - one at a time as needed in read and write */ - - /* Check if handle based operation so we - know whether we can continue or not without - returning to caller to reset file handle */ - switch (smb_command) { - case SMB_COM_READ_ANDX: - case SMB_COM_WRITE_ANDX: - case SMB_COM_CLOSE: - case SMB_COM_FIND_CLOSE2: - case SMB_COM_LOCKING_ANDX: { - unload_nls(nls_codepage); - return -EAGAIN; - } - } - } else { - up(&tcon->ses->sesSem); - } - unload_nls(nls_codepage); + /* + * Give demultiplex thread up to 10 seconds to reconnect, should be + * greater than cifs socket timeout which is 7 seconds + */ + while (server->tcpStatus == CifsNeedReconnect) { + wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus == CifsGood), 10 * HZ); - } else { - return -EIO; + /* is TCP session is reestablished now ?*/ + if (server->tcpStatus != CifsNeedReconnect) + break; + + /* + * on "soft" mounts we wait once. Hard mounts keep + * retrying until process is killed or server comes + * back on-line + */ + if (!tcon->retry || ses->status == CifsExiting) { + cFYI(1, ("gave up waiting on reconnect in smb_init")); + return -EHOSTDOWN; } } + + if (!ses->need_reconnect && !tcon->need_reconnect) + return 0; + + nls_codepage = load_nls_default(); + + /* + * need to prevent multiple threads trying to simultaneously + * reconnect the same SMB session + */ + down(&ses->sesSem); + if (ses->need_reconnect) + rc = cifs_setup_session(0, ses, nls_codepage); + + /* do we need to reconnect tcon? */ + if (rc || !tcon->need_reconnect) { + up(&ses->sesSem); + goto out; + } + + mark_open_files_invalid(tcon); + rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); + up(&ses->sesSem); + cFYI(1, ("reconnect tcon rc = %d", rc)); + + if (rc) + goto out; + + /* + * FIXME: check if wsize needs updated due to negotiated smb buffer + * size shrinking + */ + atomic_inc(&tconInfoReconnectCount); + + /* tell server Unix caps we support */ + if (ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps(0, tcon, NULL, NULL); + + /* + * Removed call to reopen open files here. It is safer (and faster) to + * reopen files one at a time as needed in read and write. + * + * FIXME: what about file locks? don't we need to reclaim them ASAP? + */ + +out: + /* + * Check if handle based operation so we know whether we can continue + * or not without returning to caller to reset file handle + */ + switch (smb_command) { + case SMB_COM_READ_ANDX: + case SMB_COM_WRITE_ANDX: + case SMB_COM_CLOSE: + case SMB_COM_FIND_CLOSE2: + case SMB_COM_LOCKING_ANDX: + rc = -EAGAIN; + } + + unload_nls(nls_codepage); + return rc; +} + +/* Allocate and return pointer to an SMB request buffer, and set basic + SMB information in the SMB header. If the return code is zero, this + function must have filled in request_buf pointer */ +static int +small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, + void **request_buf) +{ + int rc = 0; + + rc = cifs_reconnect_tcon(tcon, smb_command); if (rc) return rc; @@ -256,101 +284,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, { int rc = 0; - /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so - check for tcp and smb session status done differently - for those three - in the calling routine */ - if (tcon) { - if (tcon->tidStatus == CifsExiting) { - /* only tree disconnect, open, and write, - (and ulogoff which does not have tcon) - are allowed as we start force umount */ - if ((smb_command != SMB_COM_WRITE_ANDX) && - (smb_command != SMB_COM_OPEN_ANDX) && - (smb_command != SMB_COM_TREE_DISCONNECT)) { - cFYI(1, ("can not send cmd %d while umounting", - smb_command)); - return -ENODEV; - } - } - - if ((tcon->ses) && (tcon->ses->status != CifsExiting) && - (tcon->ses->server)) { - struct nls_table *nls_codepage; - /* Give Demultiplex thread up to 10 seconds to - reconnect, should be greater than cifs socket - timeout which is 7 seconds */ - while (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - wait_event_interruptible_timeout(tcon->ses->server->response_q, - (tcon->ses->server->tcpStatus == - CifsGood), 10 * HZ); - if (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - /* on "soft" mounts we wait once */ - if (!tcon->retry || - (tcon->ses->status == CifsExiting)) { - cFYI(1, ("gave up waiting on " - "reconnect in smb_init")); - return -EHOSTDOWN; - } /* else "hard" mount - keep retrying - until process is killed or server - comes on-line */ - } else /* TCP session is reestablished now */ - break; - } - nls_codepage = load_nls_default(); - /* need to prevent multiple threads trying to - simultaneously reconnect the same SMB session */ - down(&tcon->ses->sesSem); - if (tcon->ses->need_reconnect) - rc = cifs_setup_session(0, tcon->ses, - nls_codepage); - if (!rc && (tcon->need_reconnect)) { - mark_open_files_invalid(tcon); - rc = CIFSTCon(0, tcon->ses, tcon->treeName, - tcon, nls_codepage); - up(&tcon->ses->sesSem); - /* BB FIXME add code to check if wsize needs - update due to negotiated smb buffer size - shrinking */ - if (rc == 0) { - atomic_inc(&tconInfoReconnectCount); - /* tell server Unix caps we support */ - if (tcon->ses->capabilities & CAP_UNIX) - reset_cifs_unix_caps( - 0 /* no xid */, - tcon, - NULL /* do not know sb */, - NULL /* no vol info */); - } - - cFYI(1, ("reconnect tcon rc = %d", rc)); - /* Removed call to reopen open files here. - It is safer (and faster) to reopen files - one at a time as needed in read and write */ - - /* Check if handle based operation so we - know whether we can continue or not without - returning to caller to reset file handle */ - switch (smb_command) { - case SMB_COM_READ_ANDX: - case SMB_COM_WRITE_ANDX: - case SMB_COM_CLOSE: - case SMB_COM_FIND_CLOSE2: - case SMB_COM_LOCKING_ANDX: { - unload_nls(nls_codepage); - return -EAGAIN; - } - } - } else { - up(&tcon->ses->sesSem); - } - unload_nls(nls_codepage); - - } else { - return -EIO; - } - } + rc = cifs_reconnect_tcon(tcon, smb_command); if (rc) return rc; @@ -1113,7 +1047,10 @@ PsxCreat: psx_create_err: cifs_buf_release(pSMB); - cifs_stats_inc(&tcon->num_mkdirs); + if (posix_flags & SMB_O_DIRECTORY) + cifs_stats_inc(&tcon->num_posixmkdirs); + else + cifs_stats_inc(&tcon->num_posixopens); if (rc == -EAGAIN) goto PsxCreat; @@ -3958,6 +3895,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, if (is_unicode) { __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, GFP_KERNEL); + if (tmp == NULL) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } cifsConvertToUCS((__le16 *) tmp, searchName, PATH_MAX, nls_codepage, remap); node->path_consumed = cifs_ucs2_bytes(tmp, @@ -5074,10 +5015,114 @@ SetAttrLgcyRetry: } #endif /* temporarily unneeded SetAttr legacy function */ +static void +cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset, + const struct cifs_unix_set_info_args *args) +{ + u64 mode = args->mode; + + /* + * Samba server ignores set of file size to zero due to bugs in some + * older clients, but we should be precise - we use SetFileSize to + * set file size and do not want to truncate file size to zero + * accidently as happened on one Samba server beta by putting + * zero instead of -1 here + */ + data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64); + data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64); + data_offset->LastStatusChange = cpu_to_le64(args->ctime); + data_offset->LastAccessTime = cpu_to_le64(args->atime); + data_offset->LastModificationTime = cpu_to_le64(args->mtime); + data_offset->Uid = cpu_to_le64(args->uid); + data_offset->Gid = cpu_to_le64(args->gid); + /* better to leave device as zero when it is */ + data_offset->DevMajor = cpu_to_le64(MAJOR(args->device)); + data_offset->DevMinor = cpu_to_le64(MINOR(args->device)); + data_offset->Permissions = cpu_to_le64(mode); + + if (S_ISREG(mode)) + data_offset->Type = cpu_to_le32(UNIX_FILE); + else if (S_ISDIR(mode)) + data_offset->Type = cpu_to_le32(UNIX_DIR); + else if (S_ISLNK(mode)) + data_offset->Type = cpu_to_le32(UNIX_SYMLINK); + else if (S_ISCHR(mode)) + data_offset->Type = cpu_to_le32(UNIX_CHARDEV); + else if (S_ISBLK(mode)) + data_offset->Type = cpu_to_le32(UNIX_BLOCKDEV); + else if (S_ISFIFO(mode)) + data_offset->Type = cpu_to_le32(UNIX_FIFO); + else if (S_ISSOCK(mode)) + data_offset->Type = cpu_to_le32(UNIX_SOCKET); +} + int -CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *tcon, char *fileName, - const struct cifs_unix_set_info_args *args, - const struct nls_table *nls_codepage, int remap) +CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon, + const struct cifs_unix_set_info_args *args, + u16 fid, u32 pid_of_opener) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + FILE_UNIX_BASIC_INFO *data_offset; + int rc = 0; + u16 params, param_offset, offset, byte_count, count; + + cFYI(1, ("Set Unix Info (via SetFileInfo)")); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (FILE_UNIX_BASIC_INFO *) + ((char *)(&pSMB->hdr.Protocol) + offset); + count = sizeof(FILE_UNIX_BASIC_INFO); + + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = fid; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC); + pSMB->Reserved4 = 0; + pSMB->hdr.smb_buf_length += byte_count; + pSMB->ByteCount = cpu_to_le16(byte_count); + + cifs_fill_unix_set_info(data_offset, args); + + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + if (rc) + cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc)); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + +int +CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName, + const struct cifs_unix_set_info_args *args, + const struct nls_table *nls_codepage, int remap) { TRANSACTION2_SPI_REQ *pSMB = NULL; TRANSACTION2_SPI_RSP *pSMBr = NULL; @@ -5086,7 +5131,6 @@ CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *tcon, char *fileName, int bytes_returned = 0; FILE_UNIX_BASIC_INFO *data_offset; __u16 params, param_offset, offset, count, byte_count; - __u64 mode = args->mode; cFYI(1, ("In SetUID/GID/Mode")); setPermsRetry: @@ -5137,38 +5181,8 @@ setPermsRetry: pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC); pSMB->Reserved4 = 0; pSMB->hdr.smb_buf_length += byte_count; - /* Samba server ignores set of file size to zero due to bugs in some - older clients, but we should be precise - we use SetFileSize to - set file size and do not want to truncate file size to zero - accidently as happened on one Samba server beta by putting - zero instead of -1 here */ - data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64); - data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64); - data_offset->LastStatusChange = cpu_to_le64(args->ctime); - data_offset->LastAccessTime = cpu_to_le64(args->atime); - data_offset->LastModificationTime = cpu_to_le64(args->mtime); - data_offset->Uid = cpu_to_le64(args->uid); - data_offset->Gid = cpu_to_le64(args->gid); - /* better to leave device as zero when it is */ - data_offset->DevMajor = cpu_to_le64(MAJOR(args->device)); - data_offset->DevMinor = cpu_to_le64(MINOR(args->device)); - data_offset->Permissions = cpu_to_le64(mode); - - if (S_ISREG(mode)) - data_offset->Type = cpu_to_le32(UNIX_FILE); - else if (S_ISDIR(mode)) - data_offset->Type = cpu_to_le32(UNIX_DIR); - else if (S_ISLNK(mode)) - data_offset->Type = cpu_to_le32(UNIX_SYMLINK); - else if (S_ISCHR(mode)) - data_offset->Type = cpu_to_le32(UNIX_CHARDEV); - else if (S_ISBLK(mode)) - data_offset->Type = cpu_to_le32(UNIX_BLOCKDEV); - else if (S_ISFIFO(mode)) - data_offset->Type = cpu_to_le32(UNIX_FIFO); - else if (S_ISSOCK(mode)) - data_offset->Type = cpu_to_le32(UNIX_SOCKET); + cifs_fill_unix_set_info(data_offset, args); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e16d7592116a..d49682433c20 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -803,6 +803,10 @@ cifs_parse_mount_options(char *options, const char *devname, char *data; unsigned int temp_len, i, j; char separator[2]; + short int override_uid = -1; + short int override_gid = -1; + bool uid_specified = false; + bool gid_specified = false; separator[0] = ','; separator[1] = 0; @@ -1093,18 +1097,20 @@ cifs_parse_mount_options(char *options, const char *devname, "too long.\n"); return 1; } - } else if (strnicmp(data, "uid", 3) == 0) { - if (value && *value) - vol->linux_uid = - simple_strtoul(value, &value, 0); - } else if (strnicmp(data, "forceuid", 8) == 0) { - vol->override_uid = 1; - } else if (strnicmp(data, "gid", 3) == 0) { - if (value && *value) - vol->linux_gid = - simple_strtoul(value, &value, 0); - } else if (strnicmp(data, "forcegid", 8) == 0) { - vol->override_gid = 1; + } else if (!strnicmp(data, "uid", 3) && value && *value) { + vol->linux_uid = simple_strtoul(value, &value, 0); + uid_specified = true; + } else if (!strnicmp(data, "forceuid", 8)) { + override_uid = 1; + } else if (!strnicmp(data, "noforceuid", 10)) { + override_uid = 0; + } else if (!strnicmp(data, "gid", 3) && value && *value) { + vol->linux_gid = simple_strtoul(value, &value, 0); + gid_specified = true; + } else if (!strnicmp(data, "forcegid", 8)) { + override_gid = 1; + } else if (!strnicmp(data, "noforcegid", 10)) { + override_gid = 0; } else if (strnicmp(data, "file_mode", 4) == 0) { if (value && *value) { vol->file_mode = @@ -1355,11 +1361,23 @@ cifs_parse_mount_options(char *options, const char *devname, if (vol->UNCip == NULL) vol->UNCip = &vol->UNC[2]; + if (uid_specified) + vol->override_uid = override_uid; + else if (override_uid == 1) + printk(KERN_NOTICE "CIFS: ignoring forceuid mount option " + "specified with no uid= option.\n"); + + if (gid_specified) + vol->override_gid = override_gid; + else if (override_gid == 1) + printk(KERN_NOTICE "CIFS: ignoring forcegid mount option " + "specified with no gid= option.\n"); + return 0; } static struct TCP_Server_Info * -cifs_find_tcp_session(struct sockaddr_storage *addr) +cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port) { struct list_head *tmp; struct TCP_Server_Info *server; @@ -1379,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr) if (server->tcpStatus == CifsNew) continue; - if (addr->ss_family == AF_INET && - (addr4->sin_addr.s_addr != - server->addr.sockAddr.sin_addr.s_addr)) - continue; - else if (addr->ss_family == AF_INET6 && - (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr, - &addr6->sin6_addr) || - server->addr.sockAddr6.sin6_scope_id != - addr6->sin6_scope_id)) - continue; + switch (addr->ss_family) { + case AF_INET: + if (addr4->sin_addr.s_addr == + server->addr.sockAddr.sin_addr.s_addr) { + addr4->sin_port = htons(port); + /* user overrode default port? */ + if (addr4->sin_port) { + if (addr4->sin_port != + server->addr.sockAddr.sin_port) + continue; + } + break; + } else + continue; + + case AF_INET6: + if (ipv6_addr_equal(&addr6->sin6_addr, + &server->addr.sockAddr6.sin6_addr) && + (addr6->sin6_scope_id == + server->addr.sockAddr6.sin6_scope_id)) { + addr6->sin6_port = htons(port); + /* user overrode default port? */ + if (addr6->sin6_port) { + if (addr6->sin6_port != + server->addr.sockAddr6.sin6_port) + continue; + } + break; + } else + continue; + } ++server->srv_count; write_unlock(&cifs_tcp_ses_lock); @@ -1457,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) } /* see if we already have a matching tcp_ses */ - tcp_ses = cifs_find_tcp_session(&addr); + tcp_ses = cifs_find_tcp_session(&addr, volume_info->port); if (tcp_ses) return tcp_ses; @@ -2452,10 +2491,10 @@ try_mount_again: tcon->local_lease = volume_info->local_lease; } if (pSesInfo) { - if (pSesInfo->capabilities & CAP_LARGE_FILES) { - sb->s_maxbytes = (u64) 1 << 63; - } else - sb->s_maxbytes = (u64) 1 << 31; /* 2 GB */ + if (pSesInfo->capabilities & CAP_LARGE_FILES) + sb->s_maxbytes = MAX_LFS_FILESIZE; + else + sb->s_maxbytes = MAX_NON_LFS; } /* BB FIXME fix time_gran to be larger for LANMAN sessions */ @@ -2544,11 +2583,20 @@ remote_path_check: if (mount_data != mount_data_global) kfree(mount_data); + mount_data = cifs_compose_mount_options( cifs_sb->mountdata, full_path + 1, referrals, &fake_devname); - kfree(fake_devname); + free_dfs_info_array(referrals, num_referrals); + kfree(fake_devname); + kfree(full_path); + + if (IS_ERR(mount_data)) { + rc = PTR_ERR(mount_data); + mount_data = NULL; + goto mount_fail_check; + } if (tcon) cifs_put_tcon(tcon); @@ -2556,8 +2604,6 @@ remote_path_check: cifs_put_smb_ses(pSesInfo); cleanup_volume_info(&volume_info); - FreeXid(xid); - kfree(full_path); referral_walks_count++; goto try_mount_again; } @@ -2611,9 +2657,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, return -EIO; smb_buffer = cifs_buf_get(); - if (smb_buffer == NULL) { + if (smb_buffer == NULL) return -ENOMEM; - } + smb_buffer_response = smb_buffer; header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, @@ -2726,6 +2772,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, strncpy(tcon->treeName, tree, MAX_TREE_SIZE); /* mostly informational -- no need to fail on error here */ + kfree(tcon->nativeFileSystem); tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr, bytes_left, is_unicode, nls_codepage); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 7dc6b74f9def..a6424cfc0121 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -153,7 +153,7 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle, mutex_init(&pCifsFile->fh_mutex); mutex_init(&pCifsFile->lock_mutex); INIT_LIST_HEAD(&pCifsFile->llist); - atomic_set(&pCifsFile->wrtPending, 0); + atomic_set(&pCifsFile->count, 1); /* set the following in open now pCifsFile->pfile = file; */ @@ -188,6 +188,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, FILE_UNIX_BASIC_INFO *presp_data; __u32 posix_flags = 0; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_fattr fattr; cFYI(1, ("posix open %s", full_path)); @@ -236,22 +237,21 @@ int cifs_posix_open(char *full_path, struct inode **pinode, if (presp_data->Type == cpu_to_le32(-1)) goto posix_open_ret; /* open ok, caller does qpathinfo */ - /* get new inode and set it up */ if (!pinode) goto posix_open_ret; /* caller does not need info */ + cifs_unix_basic_to_fattr(&fattr, presp_data, cifs_sb); + + /* get new inode and set it up */ if (*pinode == NULL) { - __u64 unique_id = le64_to_cpu(presp_data->UniqueId); - *pinode = cifs_new_inode(sb, &unique_id); + *pinode = cifs_iget(sb, &fattr); + if (!*pinode) { + rc = -ENOMEM; + goto posix_open_ret; + } + } else { + cifs_fattr_to_inode(*pinode, &fattr); } - /* else an inode was passed in. Update its info, don't create one */ - - /* We do not need to close the file if new_inode fails since - the caller will retry qpathinfo as long as inode is null */ - if (*pinode == NULL) - goto posix_open_ret; - - posix_fill_in_inode(*pinode, presp_data, 1); cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only); @@ -425,9 +425,10 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, args.uid = NO_CHANGE_64; args.gid = NO_CHANGE_64; } - CIFSSMBUnixSetInfo(xid, tcon, full_path, &args, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else { /* BB implement mode setting via Windows security descriptors e.g. */ @@ -515,10 +516,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, args.uid = NO_CHANGE_64; args.gid = NO_CHANGE_64; } - rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path, - &args, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); if (!rc) { rc = cifs_get_inode_info_unix(&newinode, full_path, @@ -643,6 +644,15 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } } + /* + * O_EXCL: optimize away the lookup, but don't hash the dentry. Let + * the VFS handle the create. + */ + if (nd->flags & LOOKUP_EXCL) { + d_instantiate(direntry, NULL); + return 0; + } + /* can not grab the rename sem here since it would deadlock in the cases (beginning of sys_rename itself) in which we already have the sb rename sem */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 97ce4bf89d15..fa7beac8b80e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -53,11 +53,9 @@ static inline struct cifsFileInfo *cifs_init_private( private_data->pInode = inode; private_data->invalidHandle = false; private_data->closePend = false; - /* we have to track num writers to the inode, since writepages - does not tell us which handle the write is for so there can - be a close (overlapping with write) of the filehandle that - cifs_writepages chose to use */ - atomic_set(&private_data->wrtPending, 0); + /* Initialize reference count to one. The private data is + freed on the release of the last reference */ + atomic_set(&private_data->count, 1); return private_data; } @@ -448,9 +446,9 @@ int cifs_open(struct inode *inode, struct file *file) .mtime = NO_CHANGE_64, .device = 0, }; - CIFSSMBUnixSetInfo(xid, tcon, full_path, &args, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & + CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); } } @@ -643,7 +641,7 @@ int cifs_close(struct inode *inode, struct file *file) if (!pTcon->need_reconnect) { write_unlock(&GlobalSMBSeslock); timeout = 2; - while ((atomic_read(&pSMBFile->wrtPending) != 0) + while ((atomic_read(&pSMBFile->count) != 1) && (timeout <= 2048)) { /* Give write a better chance to get to server ahead of the close. We do not @@ -657,8 +655,6 @@ int cifs_close(struct inode *inode, struct file *file) msleep(timeout); timeout *= 4; } - if (atomic_read(&pSMBFile->wrtPending)) - cERROR(1, ("close with pending write")); if (!pTcon->need_reconnect && !pSMBFile->invalidHandle) rc = CIFSSMBClose(xid, pTcon, @@ -681,24 +677,7 @@ int cifs_close(struct inode *inode, struct file *file) list_del(&pSMBFile->flist); list_del(&pSMBFile->tlist); write_unlock(&GlobalSMBSeslock); - timeout = 10; - /* We waited above to give the SMBWrite a chance to issue - on the wire (so we do not get SMBWrite returning EBADF - if writepages is racing with close. Note that writepages - does not specify a file handle, so it is possible for a file - to be opened twice, and the application close the "wrong" - file handle - in these cases we delay long enough to allow - the SMBWrite to get on the wire before the SMB Close. - We allow total wait here over 45 seconds, more than - oplock break time, and more than enough to allow any write - to complete on the server, or to time out on the client */ - while ((atomic_read(&pSMBFile->wrtPending) != 0) - && (timeout <= 50000)) { - cERROR(1, ("writes pending, delay free of handle")); - msleep(timeout); - timeout *= 8; - } - kfree(file->private_data); + cifsFileInfo_put(file->private_data); file->private_data = NULL; } else rc = -EBADF; @@ -1236,7 +1215,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode) if (!open_file->invalidHandle) { /* found a good file */ /* lock it so it will not be closed on us */ - atomic_inc(&open_file->wrtPending); + cifsFileInfo_get(open_file); read_unlock(&GlobalSMBSeslock); return open_file; } /* else might as well continue, and look for @@ -1276,7 +1255,7 @@ refind_writable: if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) || (open_file->pfile->f_flags & O_WRONLY))) { - atomic_inc(&open_file->wrtPending); + cifsFileInfo_get(open_file); if (!open_file->invalidHandle) { /* found a good writable file */ @@ -1293,7 +1272,7 @@ refind_writable: else { /* start over in case this was deleted */ /* since the list could be modified */ read_lock(&GlobalSMBSeslock); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); goto refind_writable; } } @@ -1309,7 +1288,7 @@ refind_writable: read_lock(&GlobalSMBSeslock); /* can not use this handle, no write pending on this one after all */ - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); if (open_file->closePend) /* list could have changed */ goto refind_writable; @@ -1373,7 +1352,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) if (open_file) { bytes_written = cifs_write(open_file->pfile, write_data, to-from, &offset); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); /* Does mm or vfs already set times? */ inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); if ((bytes_written > 0) && (offset)) @@ -1562,7 +1541,7 @@ retry: bytes_to_write, offset, &bytes_written, iov, n_iov, long_op); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); cifs_update_eof(cifsi, offset, bytes_written); if (rc || bytes_written < bytes_to_write) { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 155c9e785d0c..1f09c7619319 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -77,239 +77,202 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) } } -static void cifs_unix_info_to_inode(struct inode *inode, - FILE_UNIX_BASIC_INFO *info, int force_uid_gid) +/* populate an inode with info from a cifs_fattr struct */ +void +cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) { + struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct cifsInodeInfo *cifsInfo = CIFS_I(inode); - __u64 num_of_bytes = le64_to_cpu(info->NumOfBytes); - __u64 end_of_file = le64_to_cpu(info->EndOfFile); + unsigned long oldtime = cifs_i->time; + + inode->i_atime = fattr->cf_atime; + inode->i_mtime = fattr->cf_mtime; + inode->i_ctime = fattr->cf_ctime; + inode->i_rdev = fattr->cf_rdev; + inode->i_nlink = fattr->cf_nlink; + inode->i_uid = fattr->cf_uid; + inode->i_gid = fattr->cf_gid; + + /* if dynperm is set, don't clobber existing mode */ + if (inode->i_state & I_NEW || + !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) + inode->i_mode = fattr->cf_mode; + + cifs_i->cifsAttrs = fattr->cf_cifsattrs; + cifs_i->uniqueid = fattr->cf_uniqueid; + + if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL) + cifs_i->time = 0; + else + cifs_i->time = jiffies; + + cFYI(1, ("inode 0x%p old_time=%ld new_time=%ld", inode, + oldtime, cifs_i->time)); - inode->i_atime = cifs_NTtimeToUnix(info->LastAccessTime); - inode->i_mtime = - cifs_NTtimeToUnix(info->LastModificationTime); - inode->i_ctime = cifs_NTtimeToUnix(info->LastStatusChange); - inode->i_mode = le64_to_cpu(info->Permissions); + cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; + + /* + * Can't safely change the file size here if the client is writing to + * it due to potential races. + */ + spin_lock(&inode->i_lock); + if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) { + i_size_write(inode, fattr->cf_eof); + + /* + * i_blocks is not related to (i_size / i_blksize), + * but instead 512 byte (2**9) size is required for + * calculating num blocks. + */ + inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9; + } + spin_unlock(&inode->i_lock); + + cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL); +} + +/* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */ +void +cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_uniqueid = le64_to_cpu(info->UniqueId); + fattr->cf_bytes = le64_to_cpu(info->NumOfBytes); + fattr->cf_eof = le64_to_cpu(info->EndOfFile); + + fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); + fattr->cf_mtime = cifs_NTtimeToUnix(info->LastModificationTime); + fattr->cf_ctime = cifs_NTtimeToUnix(info->LastStatusChange); + fattr->cf_mode = le64_to_cpu(info->Permissions); /* * Since we set the inode type below we need to mask off * to avoid strange results if bits set above. */ - inode->i_mode &= ~S_IFMT; + fattr->cf_mode &= ~S_IFMT; switch (le32_to_cpu(info->Type)) { case UNIX_FILE: - inode->i_mode |= S_IFREG; + fattr->cf_mode |= S_IFREG; + fattr->cf_dtype = DT_REG; break; case UNIX_SYMLINK: - inode->i_mode |= S_IFLNK; + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; break; case UNIX_DIR: - inode->i_mode |= S_IFDIR; + fattr->cf_mode |= S_IFDIR; + fattr->cf_dtype = DT_DIR; break; case UNIX_CHARDEV: - inode->i_mode |= S_IFCHR; - inode->i_rdev = MKDEV(le64_to_cpu(info->DevMajor), - le64_to_cpu(info->DevMinor) & MINORMASK); + fattr->cf_mode |= S_IFCHR; + fattr->cf_dtype = DT_CHR; + fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor), + le64_to_cpu(info->DevMinor) & MINORMASK); break; case UNIX_BLOCKDEV: - inode->i_mode |= S_IFBLK; - inode->i_rdev = MKDEV(le64_to_cpu(info->DevMajor), - le64_to_cpu(info->DevMinor) & MINORMASK); + fattr->cf_mode |= S_IFBLK; + fattr->cf_dtype = DT_BLK; + fattr->cf_rdev = MKDEV(le64_to_cpu(info->DevMajor), + le64_to_cpu(info->DevMinor) & MINORMASK); break; case UNIX_FIFO: - inode->i_mode |= S_IFIFO; + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; break; case UNIX_SOCKET: - inode->i_mode |= S_IFSOCK; + fattr->cf_mode |= S_IFSOCK; + fattr->cf_dtype = DT_SOCK; break; default: /* safest to call it a file if we do not know */ - inode->i_mode |= S_IFREG; + fattr->cf_mode |= S_IFREG; + fattr->cf_dtype = DT_REG; cFYI(1, ("unknown type %d", le32_to_cpu(info->Type))); break; } - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) && - !force_uid_gid) - inode->i_uid = cifs_sb->mnt_uid; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) + fattr->cf_uid = cifs_sb->mnt_uid; else - inode->i_uid = le64_to_cpu(info->Uid); + fattr->cf_uid = le64_to_cpu(info->Uid); - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) && - !force_uid_gid) - inode->i_gid = cifs_sb->mnt_gid; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) + fattr->cf_gid = cifs_sb->mnt_gid; else - inode->i_gid = le64_to_cpu(info->Gid); + fattr->cf_gid = le64_to_cpu(info->Gid); - inode->i_nlink = le64_to_cpu(info->Nlinks); - - cifsInfo->server_eof = end_of_file; - spin_lock(&inode->i_lock); - if (is_size_safe_to_change(cifsInfo, end_of_file)) { - /* - * We can not safely change the file size here if the client - * is writing to it due to potential races. - */ - i_size_write(inode, end_of_file); - - /* - * i_blocks is not related to (i_size / i_blksize), - * but instead 512 byte (2**9) size is required for - * calculating num blocks. - */ - inode->i_blocks = (512 - 1 + num_of_bytes) >> 9; - } - spin_unlock(&inode->i_lock); + fattr->cf_nlink = le64_to_cpu(info->Nlinks); } - /* - * Needed to setup inode data for the directory which is the - * junction to the new submount (ie to setup the fake directory - * which represents a DFS referral) - */ -static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat, - struct super_block *sb) -{ - struct inode *pinode = NULL; - - memset(pfnd_dat, 0, sizeof(FILE_UNIX_BASIC_INFO)); - -/* __le64 pfnd_dat->EndOfFile = cpu_to_le64(0); - __le64 pfnd_dat->NumOfBytes = cpu_to_le64(0); - __u64 UniqueId = 0; */ - pfnd_dat->LastStatusChange = - cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); - pfnd_dat->LastAccessTime = - cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); - pfnd_dat->LastModificationTime = - cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); - pfnd_dat->Type = cpu_to_le32(UNIX_DIR); - pfnd_dat->Permissions = cpu_to_le64(S_IXUGO | S_IRWXU); - pfnd_dat->Nlinks = cpu_to_le64(2); - if (sb->s_root) - pinode = sb->s_root->d_inode; - if (pinode == NULL) - return; - - /* fill in default values for the remaining based on root - inode since we can not query the server for this inode info */ - pfnd_dat->DevMajor = cpu_to_le64(MAJOR(pinode->i_rdev)); - pfnd_dat->DevMinor = cpu_to_le64(MINOR(pinode->i_rdev)); - pfnd_dat->Uid = cpu_to_le64(pinode->i_uid); - pfnd_dat->Gid = cpu_to_le64(pinode->i_gid); -} - -/** - * cifs_new inode - create new inode, initialize, and hash it - * @sb - pointer to superblock - * @inum - if valid pointer and serverino is enabled, replace i_ino with val - * - * Create a new inode, initialize it for CIFS and hash it. Returns the new - * inode or NULL if one couldn't be allocated. + * Fill a cifs_fattr struct with fake inode info. * - * If the share isn't mounted with "serverino" or inum is a NULL pointer then - * we'll just use the inode number assigned by new_inode(). Note that this can - * mean i_ino collisions since the i_ino assigned by new_inode is not - * guaranteed to be unique. + * Needed to setup cifs_fattr data for the directory which is the + * junction to the new submount (ie to setup the fake directory + * which represents a DFS referral). */ -struct inode * -cifs_new_inode(struct super_block *sb, __u64 *inum) +static void +cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) { - struct inode *inode; - - inode = new_inode(sb); - if (inode == NULL) - return NULL; - - /* - * BB: Is i_ino == 0 legal? Here, we assume that it is. If it isn't we - * stop passing inum as ptr. Are there sanity checks we can use to - * ensure that the server is really filling in that field? Also, - * if serverino is disabled, perhaps we should be using iunique()? - */ - if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) - inode->i_ino = (unsigned long) *inum; - - /* - * must set this here instead of cifs_alloc_inode since VFS will - * clobber i_flags - */ - if (sb->s_flags & MS_NOATIME) - inode->i_flags |= S_NOATIME | S_NOCMTIME; - - insert_inode_hash(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - return inode; + cFYI(1, ("creating fake fattr for DFS referral")); + + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; + fattr->cf_uid = cifs_sb->mnt_uid; + fattr->cf_gid = cifs_sb->mnt_gid; + fattr->cf_atime = CURRENT_TIME; + fattr->cf_ctime = CURRENT_TIME; + fattr->cf_mtime = CURRENT_TIME; + fattr->cf_nlink = 2; + fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; } int cifs_get_inode_info_unix(struct inode **pinode, - const unsigned char *full_path, struct super_block *sb, int xid) + const unsigned char *full_path, + struct super_block *sb, int xid) { - int rc = 0; + int rc; FILE_UNIX_BASIC_INFO find_data; - struct cifsTconInfo *pTcon; - struct inode *inode; + struct cifs_fattr fattr; + struct cifsTconInfo *tcon; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - bool is_dfs_referral = false; - struct cifsInodeInfo *cifsInfo; - __u64 num_of_bytes; - __u64 end_of_file; - pTcon = cifs_sb->tcon; + tcon = cifs_sb->tcon; cFYI(1, ("Getting info on %s", full_path)); /* could have done a find first instead but this returns more info */ - rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data, + rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == -EREMOTE && !is_dfs_referral) { - is_dfs_referral = true; - cFYI(DBG2, ("DFS ref")); - /* for DFS, server does not give us real inode data */ - fill_fake_finddataunix(&find_data, sb); - rc = 0; - } else if (rc) - goto cgiiu_exit; - num_of_bytes = le64_to_cpu(find_data.NumOfBytes); - end_of_file = le64_to_cpu(find_data.EndOfFile); + if (!rc) { + cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); + } else if (rc == -EREMOTE) { + cifs_create_dfs_fattr(&fattr, sb); + rc = 0; + } else { + return rc; + } - /* get new inode */ if (*pinode == NULL) { - __u64 unique_id = le64_to_cpu(find_data.UniqueId); - *pinode = cifs_new_inode(sb, &unique_id); - if (*pinode == NULL) { + /* get new inode */ + *pinode = cifs_iget(sb, &fattr); + if (!*pinode) rc = -ENOMEM; - goto cgiiu_exit; - } + } else { + /* we already have inode, update it */ + cifs_fattr_to_inode(*pinode, &fattr); } - inode = *pinode; - cifsInfo = CIFS_I(inode); - - cFYI(1, ("Old time %ld", cifsInfo->time)); - cifsInfo->time = jiffies; - cFYI(1, ("New time %ld", cifsInfo->time)); - /* this is ok to set on every inode revalidate */ - atomic_set(&cifsInfo->inUse, 1); - - cifs_unix_info_to_inode(inode, &find_data, 0); - - if (num_of_bytes < end_of_file) - cFYI(1, ("allocation size less than end of file")); - cFYI(1, ("Size %ld and blocks %llu", - (unsigned long) inode->i_size, - (unsigned long long)inode->i_blocks)); - - cifs_set_ops(inode, is_dfs_referral); -cgiiu_exit: return rc; } -static int decode_sfu_inode(struct inode *inode, __u64 size, - const unsigned char *path, - struct cifs_sb_info *cifs_sb, int xid) +static int +cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, + struct cifs_sb_info *cifs_sb, int xid) { int rc; int oplock = 0; @@ -321,10 +284,15 @@ static int decode_sfu_inode(struct inode *inode, __u64 size, pbuf = buf; - if (size == 0) { - inode->i_mode |= S_IFIFO; + fattr->cf_mode &= ~S_IFMT; + + if (fattr->cf_eof == 0) { + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; return 0; - } else if (size < 8) { + } else if (fattr->cf_eof < 8) { + fattr->cf_mode |= S_IFREG; + fattr->cf_dtype = DT_REG; return -EINVAL; /* EOPNOTSUPP? */ } @@ -336,42 +304,46 @@ static int decode_sfu_inode(struct inode *inode, __u64 size, if (rc == 0) { int buf_type = CIFS_NO_BUFFER; /* Read header */ - rc = CIFSSMBRead(xid, pTcon, - netfid, + rc = CIFSSMBRead(xid, pTcon, netfid, 24 /* length */, 0 /* offset */, &bytes_read, &pbuf, &buf_type); if ((rc == 0) && (bytes_read >= 8)) { if (memcmp("IntxBLK", pbuf, 8) == 0) { cFYI(1, ("Block device")); - inode->i_mode |= S_IFBLK; + fattr->cf_mode |= S_IFBLK; + fattr->cf_dtype = DT_BLK; if (bytes_read == 24) { /* we have enough to decode dev num */ __u64 mjr; /* major */ __u64 mnr; /* minor */ mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); - inode->i_rdev = MKDEV(mjr, mnr); + fattr->cf_rdev = MKDEV(mjr, mnr); } } else if (memcmp("IntxCHR", pbuf, 8) == 0) { cFYI(1, ("Char device")); - inode->i_mode |= S_IFCHR; + fattr->cf_mode |= S_IFCHR; + fattr->cf_dtype = DT_CHR; if (bytes_read == 24) { /* we have enough to decode dev num */ __u64 mjr; /* major */ __u64 mnr; /* minor */ mjr = le64_to_cpu(*(__le64 *)(pbuf+8)); mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); - inode->i_rdev = MKDEV(mjr, mnr); + fattr->cf_rdev = MKDEV(mjr, mnr); } } else if (memcmp("IntxLNK", pbuf, 7) == 0) { cFYI(1, ("Symlink")); - inode->i_mode |= S_IFLNK; + fattr->cf_mode |= S_IFLNK; + fattr->cf_dtype = DT_LNK; } else { - inode->i_mode |= S_IFREG; /* file? */ + fattr->cf_mode |= S_IFREG; /* file? */ + fattr->cf_dtype = DT_REG; rc = -EOPNOTSUPP; } } else { - inode->i_mode |= S_IFREG; /* then it is a file */ + fattr->cf_mode |= S_IFREG; /* then it is a file */ + fattr->cf_dtype = DT_REG; rc = -EOPNOTSUPP; /* or some unknown SFU type */ } CIFSSMBClose(xid, pTcon, netfid); @@ -381,9 +353,13 @@ static int decode_sfu_inode(struct inode *inode, __u64 size, #define SFBITS_MASK (S_ISVTX | S_ISGID | S_ISUID) /* SETFILEBITS valid bits */ -static int get_sfu_mode(struct inode *inode, - const unsigned char *path, - struct cifs_sb_info *cifs_sb, int xid) +/* + * Fetch mode bits as provided by SFU. + * + * FIXME: Doesn't this clobber the type bit we got from cifs_sfu_type ? + */ +static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, + struct cifs_sb_info *cifs_sb, int xid) { #ifdef CONFIG_CIFS_XATTR ssize_t rc; @@ -391,68 +367,80 @@ static int get_sfu_mode(struct inode *inode, __u32 mode; rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS", - ea_value, 4 /* size of buf */, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + ea_value, 4 /* size of buf */, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc < 0) return (int)rc; else if (rc > 3) { mode = le32_to_cpu(*((__le32 *)ea_value)); - inode->i_mode &= ~SFBITS_MASK; - cFYI(1, ("special bits 0%o org mode 0%o", mode, inode->i_mode)); - inode->i_mode = (mode & SFBITS_MASK) | inode->i_mode; + fattr->cf_mode &= ~SFBITS_MASK; + cFYI(1, ("special bits 0%o org mode 0%o", mode, + fattr->cf_mode)); + fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode; cFYI(1, ("special mode bits 0%o", mode)); - return 0; - } else { - return 0; } + + return 0; #else return -EOPNOTSUPP; #endif } -/* - * Needed to setup inode data for the directory which is the - * junction to the new submount (ie to setup the fake directory - * which represents a DFS referral) - */ -static void fill_fake_finddata(FILE_ALL_INFO *pfnd_dat, - struct super_block *sb) +/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ +static void +cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, + struct cifs_sb_info *cifs_sb, bool adjust_tz) { - memset(pfnd_dat, 0, sizeof(FILE_ALL_INFO)); - -/* __le64 pfnd_dat->AllocationSize = cpu_to_le64(0); - __le64 pfnd_dat->EndOfFile = cpu_to_le64(0); - __u8 pfnd_dat->DeletePending = 0; - __u8 pfnd_data->Directory = 0; - __le32 pfnd_dat->EASize = 0; - __u64 pfnd_dat->IndexNumber = 0; - __u64 pfnd_dat->IndexNumber1 = 0; */ - pfnd_dat->CreationTime = - cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); - pfnd_dat->LastAccessTime = - cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); - pfnd_dat->LastWriteTime = - cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); - pfnd_dat->ChangeTime = - cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); - pfnd_dat->Attributes = cpu_to_le32(ATTR_DIRECTORY); - pfnd_dat->NumberOfLinks = cpu_to_le32(2); + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_cifsattrs = le32_to_cpu(info->Attributes); + if (info->DeletePending) + fattr->cf_flags |= CIFS_FATTR_DELETE_PENDING; + + if (info->LastAccessTime) + fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); + else + fattr->cf_atime = CURRENT_TIME; + + fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); + fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); + + if (adjust_tz) { + fattr->cf_ctime.tv_sec += cifs_sb->tcon->ses->server->timeAdj; + fattr->cf_mtime.tv_sec += cifs_sb->tcon->ses->server->timeAdj; + } + + fattr->cf_eof = le64_to_cpu(info->EndOfFile); + fattr->cf_bytes = le64_to_cpu(info->AllocationSize); + + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; + fattr->cf_dtype = DT_DIR; + } else { + fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_REG; + + /* clear write bits if ATTR_READONLY is set */ + if (fattr->cf_cifsattrs & ATTR_READONLY) + fattr->cf_mode &= ~(S_IWUGO); + } + + fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); + + fattr->cf_uid = cifs_sb->mnt_uid; + fattr->cf_gid = cifs_sb->mnt_gid; } int cifs_get_inode_info(struct inode **pinode, const unsigned char *full_path, FILE_ALL_INFO *pfindData, struct super_block *sb, int xid, const __u16 *pfid) { - int rc = 0; - __u32 attr; - struct cifsInodeInfo *cifsInfo; + int rc = 0, tmprc; struct cifsTconInfo *pTcon; - struct inode *inode; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); char *buf = NULL; bool adjustTZ = false; - bool is_dfs_referral = false; - umode_t default_mode; + struct cifs_fattr fattr; pTcon = cifs_sb->tcon; cFYI(1, ("Getting info on %s", full_path)); @@ -487,163 +475,85 @@ int cifs_get_inode_info(struct inode **pinode, adjustTZ = true; } } - /* dump_mem("\nQPathInfo return data",&findData, sizeof(findData)); */ - if (rc == -EREMOTE) { - is_dfs_referral = true; - fill_fake_finddata(pfindData, sb); + + if (!rc) { + cifs_all_info_to_fattr(&fattr, (FILE_ALL_INFO *) pfindData, + cifs_sb, adjustTZ); + } else if (rc == -EREMOTE) { + cifs_create_dfs_fattr(&fattr, sb); rc = 0; - } else if (rc) + } else { goto cgii_exit; + } - attr = le32_to_cpu(pfindData->Attributes); - - /* get new inode */ + /* + * If an inode wasn't passed in, then get the inode number + * + * Is an i_ino of zero legal? Can we use that to check if the server + * supports returning inode numbers? Are there other sanity checks we + * can use to ensure that the server is really filling in that field? + * + * We can not use the IndexNumber field by default from Windows or + * Samba (in ALL_INFO buf) but we can request it explicitly. The SNIA + * CIFS spec claims that this value is unique within the scope of a + * share, and the windows docs hint that it's actually unique + * per-machine. + * + * There may be higher info levels that work but are there Windows + * server or network appliances for which IndexNumber field is not + * guaranteed unique? + */ if (*pinode == NULL) { - __u64 inode_num; - __u64 *pinum = &inode_num; - - /* Is an i_ino of zero legal? Can we use that to check - if the server supports returning inode numbers? Are - there other sanity checks we can use to ensure that - the server is really filling in that field? */ - - /* We can not use the IndexNumber field by default from - Windows or Samba (in ALL_INFO buf) but we can request - it explicitly. It may not be unique presumably if - the server has multiple devices mounted under one share */ - - /* There may be higher info levels that work but are - there Windows server or network appliances for which - IndexNumber field is not guaranteed unique? */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { int rc1 = 0; rc1 = CIFSGetSrvInodeNumber(xid, pTcon, - full_path, pinum, + full_path, &fattr.cf_uniqueid, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc1) { cFYI(1, ("GetSrvInodeNum rc %d", rc1)); - pinum = NULL; - /* BB EOPNOSUPP disable SERVER_INUM? */ + fattr.cf_uniqueid = iunique(sb, ROOT_I); + /* disable serverino if call not supported */ + if (rc1 == -EINVAL) + cifs_sb->mnt_cifs_flags &= + ~CIFS_MOUNT_SERVER_INUM; } } else { - pinum = NULL; - } - - *pinode = cifs_new_inode(sb, pinum); - if (*pinode == NULL) { - rc = -ENOMEM; - goto cgii_exit; + fattr.cf_uniqueid = iunique(sb, ROOT_I); } - } - inode = *pinode; - cifsInfo = CIFS_I(inode); - cifsInfo->cifsAttrs = attr; - cifsInfo->delete_pending = pfindData->DeletePending ? true : false; - cFYI(1, ("Old time %ld", cifsInfo->time)); - cifsInfo->time = jiffies; - cFYI(1, ("New time %ld", cifsInfo->time)); - - /* blksize needs to be multiple of two. So safer to default to - blksize and blkbits set in superblock so 2**blkbits and blksize - will match rather than setting to: - (pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/ - - /* Linux can not store file creation time so ignore it */ - if (pfindData->LastAccessTime) - inode->i_atime = cifs_NTtimeToUnix(pfindData->LastAccessTime); - else /* do not need to use current_fs_time - time not stored */ - inode->i_atime = CURRENT_TIME; - inode->i_mtime = cifs_NTtimeToUnix(pfindData->LastWriteTime); - inode->i_ctime = cifs_NTtimeToUnix(pfindData->ChangeTime); - cFYI(DBG2, ("Attributes came in as 0x%x", attr)); - if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) { - inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj; - inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj; - } - - /* get default inode mode */ - if (attr & ATTR_DIRECTORY) - default_mode = cifs_sb->mnt_dir_mode; - else - default_mode = cifs_sb->mnt_file_mode; - - /* set permission bits */ - if (atomic_read(&cifsInfo->inUse) == 0 || - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) - inode->i_mode = default_mode; - else { - /* just reenable write bits if !ATTR_READONLY */ - if ((inode->i_mode & S_IWUGO) == 0 && - (attr & ATTR_READONLY) == 0) - inode->i_mode |= (S_IWUGO & default_mode); - - inode->i_mode &= ~S_IFMT; - } - /* clear write bits if ATTR_READONLY is set */ - if (attr & ATTR_READONLY) - inode->i_mode &= ~S_IWUGO; - - /* set inode type */ - if ((attr & ATTR_SYSTEM) && - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) { - /* no need to fix endianness on 0 */ - if (pfindData->EndOfFile == 0) - inode->i_mode |= S_IFIFO; - else if (decode_sfu_inode(inode, - le64_to_cpu(pfindData->EndOfFile), - full_path, cifs_sb, xid)) - cFYI(1, ("unknown SFU file type\n")); } else { - if (attr & ATTR_DIRECTORY) - inode->i_mode |= S_IFDIR; - else - inode->i_mode |= S_IFREG; + fattr.cf_uniqueid = CIFS_I(*pinode)->uniqueid; } - cifsInfo->server_eof = le64_to_cpu(pfindData->EndOfFile); - spin_lock(&inode->i_lock); - if (is_size_safe_to_change(cifsInfo, cifsInfo->server_eof)) { - /* can not safely shrink the file size here if the - client is writing to it due to potential races */ - i_size_write(inode, cifsInfo->server_eof); - - /* 512 bytes (2**9) is the fake blocksize that must be - used for this calculation */ - inode->i_blocks = (512 - 1 + le64_to_cpu( - pfindData->AllocationSize)) >> 9; + /* query for SFU type info if supported and needed */ + if (fattr.cf_cifsattrs & ATTR_SYSTEM && + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { + tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid); + if (tmprc) + cFYI(1, ("cifs_sfu_type failed: %d", tmprc)); } - spin_unlock(&inode->i_lock); - - inode->i_nlink = le32_to_cpu(pfindData->NumberOfLinks); - /* BB fill in uid and gid here? with help from winbind? - or retrieve from NTFS stream extended attribute */ #ifdef CONFIG_CIFS_EXPERIMENTAL /* fill in 0777 bits from ACL */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { cFYI(1, ("Getting mode bits from ACL")); - acl_to_uid_mode(cifs_sb, inode, full_path, pfid); + cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid); } #endif - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { - /* fill in remaining high mode bits e.g. SUID, VTX */ - get_sfu_mode(inode, full_path, cifs_sb, xid); - } else if (atomic_read(&cifsInfo->inUse) == 0) { - inode->i_uid = cifs_sb->mnt_uid; - inode->i_gid = cifs_sb->mnt_gid; - /* set so we do not keep refreshing these fields with - bad data after user has changed them in memory */ - atomic_set(&cifsInfo->inUse, 1); - } - - cifs_set_ops(inode, is_dfs_referral); - + /* fill in remaining high mode bits e.g. SUID, VTX */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) + cifs_sfu_mode(&fattr, full_path, cifs_sb, xid); + if (!*pinode) { + *pinode = cifs_iget(sb, &fattr); + if (!*pinode) + rc = -ENOMEM; + } else { + cifs_fattr_to_inode(*pinode, &fattr); + } cgii_exit: kfree(buf); @@ -695,33 +605,78 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb) return full_path; } +static int +cifs_find_inode(struct inode *inode, void *opaque) +{ + struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + + if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) + return 0; + + return 1; +} + +static int +cifs_init_inode(struct inode *inode, void *opaque) +{ + struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + + CIFS_I(inode)->uniqueid = fattr->cf_uniqueid; + return 0; +} + +/* Given fattrs, get a corresponding inode */ +struct inode * +cifs_iget(struct super_block *sb, struct cifs_fattr *fattr) +{ + unsigned long hash; + struct inode *inode; + + cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid)); + + /* hash down to 32-bits on 32-bit arch */ + hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); + + inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr); + + /* we have fattrs in hand, update the inode */ + if (inode) { + cifs_fattr_to_inode(inode, fattr); + if (sb->s_flags & MS_NOATIME) + inode->i_flags |= S_NOATIME | S_NOCMTIME; + if (inode->i_state & I_NEW) { + inode->i_ino = hash; + unlock_new_inode(inode); + } + } + + return inode; +} + /* gets root inode */ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) { int xid; struct cifs_sb_info *cifs_sb; - struct inode *inode; + struct inode *inode = NULL; long rc; char *full_path; - inode = iget_locked(sb, ino); - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; - - cifs_sb = CIFS_SB(inode->i_sb); + cifs_sb = CIFS_SB(sb); full_path = cifs_build_path_to_root(cifs_sb); if (full_path == NULL) return ERR_PTR(-ENOMEM); xid = GetXid(); if (cifs_sb->tcon->unix_ext) - rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, - xid); + rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); else - rc = cifs_get_inode_info(&inode, full_path, NULL, inode->i_sb, + rc = cifs_get_inode_info(&inode, full_path, NULL, sb, xid, NULL); + + if (!inode) + return ERR_PTR(-ENOMEM); + if (rc && cifs_sb->tcon->ipc) { cFYI(1, ("ipc connection - fake read inode")); inode->i_mode |= S_IFDIR; @@ -737,7 +692,6 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) return ERR_PTR(rc); } - unlock_new_inode(inode); kfree(full_path); /* can not call macro FreeXid here since in a void func @@ -846,7 +800,7 @@ set_via_filehandle: if (open_file == NULL) CIFSSMBClose(xid, pTcon, netfid); else - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); out: return rc; } @@ -1063,44 +1017,6 @@ out_reval: return rc; } -void posix_fill_in_inode(struct inode *tmp_inode, - FILE_UNIX_BASIC_INFO *pData, int isNewInode) -{ - struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode); - loff_t local_size; - struct timespec local_mtime; - - cifsInfo->time = jiffies; - atomic_inc(&cifsInfo->inUse); - - /* save mtime and size */ - local_mtime = tmp_inode->i_mtime; - local_size = tmp_inode->i_size; - - cifs_unix_info_to_inode(tmp_inode, pData, 1); - cifs_set_ops(tmp_inode, false); - - if (!S_ISREG(tmp_inode->i_mode)) - return; - - /* - * No sense invalidating pages for new inode - * since we we have not started caching - * readahead file data yet. - */ - if (isNewInode) - return; - - if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) && - (local_size == tmp_inode->i_size)) { - cFYI(1, ("inode exists but unchanged")); - } else { - /* file may have changed on server */ - cFYI(1, ("invalidate inode, readdir detected change")); - invalidate_remote_inode(tmp_inode); - } -} - int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) { int rc = 0, tmprc; @@ -1109,6 +1025,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) struct cifsTconInfo *pTcon; char *full_path = NULL; struct inode *newinode = NULL; + struct cifs_fattr fattr; cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode)); @@ -1148,7 +1065,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) cFYI(1, ("posix mkdir returned 0x%x", rc)); d_drop(direntry); } else { - __u64 unique_id; if (pInfo->Type == cpu_to_le32(-1)) { /* no return info, go query for it */ kfree(pInfo); @@ -1162,20 +1078,15 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) else direntry->d_op = &cifs_dentry_ops; - unique_id = le64_to_cpu(pInfo->UniqueId); - newinode = cifs_new_inode(inode->i_sb, &unique_id); - if (newinode == NULL) { + cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); + newinode = cifs_iget(inode->i_sb, &fattr); + if (!newinode) { kfree(pInfo); goto mkdir_get_info; } - newinode->i_nlink = 2; d_instantiate(direntry, newinode); - /* we already checked in POSIXCreate whether - frame was long enough */ - posix_fill_in_inode(direntry->d_inode, - pInfo, 1 /* NewInode */); #ifdef CONFIG_CIFS_DEBUG2 cFYI(1, ("instantiated dentry %p %s to inode %p", direntry, direntry->d_name.name, newinode)); @@ -1238,10 +1149,10 @@ mkdir_get_info: args.uid = NO_CHANGE_64; args.gid = NO_CHANGE_64; } - CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else { if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && (mode & S_IWUGO) == 0) { @@ -1622,6 +1533,7 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, if (!err) { generic_fillattr(dentry->d_inode, stat); stat->blksize = CIFS_MAX_MSGSIZE; + stat->ino = CIFS_I(dentry->d_inode)->uniqueid; } return err; } @@ -1723,7 +1635,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, __u32 npid = open_file->pid; rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, npid, false); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); cFYI(1, ("SetFSize for attrs rc = %d", rc)); if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { unsigned int bytes_written; @@ -1786,6 +1698,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsTconInfo *pTcon = cifs_sb->tcon; struct cifs_unix_set_info_args *args = NULL; + struct cifsFileInfo *open_file; cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x", direntry->d_name.name, attrs->ia_valid)); @@ -1872,10 +1785,18 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) args->ctime = NO_CHANGE_64; args->device = 0; - rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path, args, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + open_file = find_writable_file(cifsInode); + if (open_file) { + u16 nfid = open_file->netfid; + u32 npid = open_file->pid; + rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); + cifsFileInfo_put(open_file); + } else { + rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + } if (!rc) rc = inode_setattr(inode, attrs); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 86d0055dc529..f823a4a208a7 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -63,374 +63,123 @@ static inline void dump_cifs_file_struct(struct file *file, char *label) } #endif /* DEBUG2 */ -/* Returns 1 if new inode created, 2 if both dentry and inode were */ -/* Might check in the future if inode number changed so we can rehash inode */ -static int -construct_dentry(struct qstr *qstring, struct file *file, - struct inode **ptmp_inode, struct dentry **pnew_dentry, - __u64 *inum) +/* + * Find the dentry that matches "name". If there isn't one, create one. If it's + * a negative dentry or the uniqueid changed, then drop it and recreate it. + */ +static struct dentry * +cifs_readdir_lookup(struct dentry *parent, struct qstr *name, + struct cifs_fattr *fattr) { - struct dentry *tmp_dentry = NULL; - struct super_block *sb = file->f_path.dentry->d_sb; - int rc = 0; + struct dentry *dentry, *alias; + struct inode *inode; + struct super_block *sb = parent->d_inode->i_sb; + + cFYI(1, ("For %s", name->name)); + + dentry = d_lookup(parent, name); + if (dentry) { + /* FIXME: check for inode number changes? */ + if (dentry->d_inode != NULL) + return dentry; + d_drop(dentry); + dput(dentry); + } - cFYI(1, ("For %s", qstring->name)); - - qstring->hash = full_name_hash(qstring->name, qstring->len); - tmp_dentry = d_lookup(file->f_path.dentry, qstring); - if (tmp_dentry) { - /* BB: overwrite old name? i.e. tmp_dentry->d_name and - * tmp_dentry->d_name.len?? - */ - cFYI(0, ("existing dentry with inode 0x%p", - tmp_dentry->d_inode)); - *ptmp_inode = tmp_dentry->d_inode; - if (*ptmp_inode == NULL) { - *ptmp_inode = cifs_new_inode(sb, inum); - if (*ptmp_inode == NULL) - return rc; - rc = 1; - } - } else { - tmp_dentry = d_alloc(file->f_path.dentry, qstring); - if (tmp_dentry == NULL) { - cERROR(1, ("Failed allocating dentry")); - *ptmp_inode = NULL; - return rc; - } + dentry = d_alloc(parent, name); + if (dentry == NULL) + return NULL; - if (CIFS_SB(sb)->tcon->nocase) - tmp_dentry->d_op = &cifs_ci_dentry_ops; - else - tmp_dentry->d_op = &cifs_dentry_ops; + inode = cifs_iget(sb, fattr); + if (!inode) { + dput(dentry); + return NULL; + } - *ptmp_inode = cifs_new_inode(sb, inum); - if (*ptmp_inode == NULL) - return rc; - rc = 2; + if (CIFS_SB(sb)->tcon->nocase) + dentry->d_op = &cifs_ci_dentry_ops; + else + dentry->d_op = &cifs_dentry_ops; + + alias = d_materialise_unique(dentry, inode); + if (alias != NULL) { + dput(dentry); + if (IS_ERR(alias)) + return NULL; + dentry = alias; } - tmp_dentry->d_time = jiffies; - *pnew_dentry = tmp_dentry; - return rc; + return dentry; } -static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, - char *buf, unsigned int *pobject_type, int isNewInode) +static void +cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) { - loff_t local_size; - struct timespec local_mtime; - - struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb); - __u32 attr; - __u64 allocation_size; - __u64 end_of_file; - umode_t default_mode; - - /* save mtime and size */ - local_mtime = tmp_inode->i_mtime; - local_size = tmp_inode->i_size; - - if (new_buf_type) { - FILE_DIRECTORY_INFO *pfindData = (FILE_DIRECTORY_INFO *)buf; - - attr = le32_to_cpu(pfindData->ExtFileAttributes); - allocation_size = le64_to_cpu(pfindData->AllocationSize); - end_of_file = le64_to_cpu(pfindData->EndOfFile); - tmp_inode->i_atime = - cifs_NTtimeToUnix(pfindData->LastAccessTime); - tmp_inode->i_mtime = - cifs_NTtimeToUnix(pfindData->LastWriteTime); - tmp_inode->i_ctime = - cifs_NTtimeToUnix(pfindData->ChangeTime); - } else { /* legacy, OS2 and DOS style */ - int offset = cifs_sb->tcon->ses->server->timeAdj; - FIND_FILE_STANDARD_INFO *pfindData = - (FIND_FILE_STANDARD_INFO *)buf; - - tmp_inode->i_mtime = cnvrtDosUnixTm(pfindData->LastWriteDate, - pfindData->LastWriteTime, - offset); - tmp_inode->i_atime = cnvrtDosUnixTm(pfindData->LastAccessDate, - pfindData->LastAccessTime, - offset); - tmp_inode->i_ctime = cnvrtDosUnixTm(pfindData->LastWriteDate, - pfindData->LastWriteTime, - offset); - attr = le16_to_cpu(pfindData->Attributes); - allocation_size = le32_to_cpu(pfindData->AllocationSize); - end_of_file = le32_to_cpu(pfindData->DataSize); - } + fattr->cf_uid = cifs_sb->mnt_uid; + fattr->cf_gid = cifs_sb->mnt_gid; - /* Linux can not store file creation time unfortunately so ignore it */ - - cifsInfo->cifsAttrs = attr; -#ifdef CONFIG_CIFS_EXPERIMENTAL - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - /* get more accurate mode via ACL - so force inode refresh */ - cifsInfo->time = 0; - } else -#endif /* CONFIG_CIFS_EXPERIMENTAL */ - cifsInfo->time = jiffies; - - /* treat dos attribute of read-only as read-only mode bit e.g. 555? */ - /* 2767 perms - indicate mandatory locking */ - /* BB fill in uid and gid here? with help from winbind? - or retrieve from NTFS stream extended attribute */ - if (atomic_read(&cifsInfo->inUse) == 0) { - tmp_inode->i_uid = cifs_sb->mnt_uid; - tmp_inode->i_gid = cifs_sb->mnt_gid; - } - - if (attr & ATTR_DIRECTORY) - default_mode = cifs_sb->mnt_dir_mode; - else - default_mode = cifs_sb->mnt_file_mode; - - /* set initial permissions */ - if ((atomic_read(&cifsInfo->inUse) == 0) || - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) - tmp_inode->i_mode = default_mode; - else { - /* just reenable write bits if !ATTR_READONLY */ - if ((tmp_inode->i_mode & S_IWUGO) == 0 && - (attr & ATTR_READONLY) == 0) - tmp_inode->i_mode |= (S_IWUGO & default_mode); - - tmp_inode->i_mode &= ~S_IFMT; + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; + fattr->cf_dtype = DT_DIR; + } else { + fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_REG; } - /* clear write bits if ATTR_READONLY is set */ - if (attr & ATTR_READONLY) - tmp_inode->i_mode &= ~S_IWUGO; + if (fattr->cf_cifsattrs & ATTR_READONLY) + fattr->cf_mode &= ~S_IWUGO; - /* set inode type */ - if ((attr & ATTR_SYSTEM) && - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) { - if (end_of_file == 0) { - tmp_inode->i_mode |= S_IFIFO; - *pobject_type = DT_FIFO; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && + fattr->cf_cifsattrs & ATTR_SYSTEM) { + if (fattr->cf_eof == 0) { + fattr->cf_mode &= ~S_IFMT; + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; } else { /* - * trying to get the type can be slow, so just call - * this a regular file for now, and mark for reval + * trying to get the type and mode via SFU can be slow, + * so just call those regular files for now, and mark + * for reval */ - tmp_inode->i_mode |= S_IFREG; - *pobject_type = DT_REG; - cifsInfo->time = 0; - } - } else { - if (attr & ATTR_DIRECTORY) { - tmp_inode->i_mode |= S_IFDIR; - *pobject_type = DT_DIR; - } else { - tmp_inode->i_mode |= S_IFREG; - *pobject_type = DT_REG; + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; } } +} - /* can not fill in nlink here as in qpathinfo version and Unx search */ - if (atomic_read(&cifsInfo->inUse) == 0) - atomic_set(&cifsInfo->inUse, 1); - - cifsInfo->server_eof = end_of_file; - spin_lock(&tmp_inode->i_lock); - if (is_size_safe_to_change(cifsInfo, end_of_file)) { - /* can not safely change the file size here if the - client is writing to it due to potential races */ - i_size_write(tmp_inode, end_of_file); - - /* 512 bytes (2**9) is the fake blocksize that must be used */ - /* for this calculation, even though the reported blocksize is larger */ - tmp_inode->i_blocks = (512 - 1 + allocation_size) >> 9; - } - spin_unlock(&tmp_inode->i_lock); - - if (allocation_size < end_of_file) - cFYI(1, ("May be sparse file, allocation less than file size")); - cFYI(1, ("File Size %ld and blocks %llu", - (unsigned long)tmp_inode->i_size, - (unsigned long long)tmp_inode->i_blocks)); - if (S_ISREG(tmp_inode->i_mode)) { - cFYI(1, ("File inode")); - tmp_inode->i_op = &cifs_file_inode_ops; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) - tmp_inode->i_fop = &cifs_file_direct_nobrl_ops; - else - tmp_inode->i_fop = &cifs_file_direct_ops; - } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) - tmp_inode->i_fop = &cifs_file_nobrl_ops; - else - tmp_inode->i_fop = &cifs_file_ops; - - if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) && - (cifs_sb->tcon->ses->server->maxBuf < - PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)) - tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf; - else - tmp_inode->i_data.a_ops = &cifs_addr_ops; - - if (isNewInode) - return; /* No sense invalidating pages for new inode - since have not started caching readahead file - data yet */ - - if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) && - (local_size == tmp_inode->i_size)) { - cFYI(1, ("inode exists but unchanged")); - } else { - /* file may have changed on server */ - cFYI(1, ("invalidate inode, readdir detected change")); - invalidate_remote_inode(tmp_inode); - } - } else if (S_ISDIR(tmp_inode->i_mode)) { - cFYI(1, ("Directory inode")); - tmp_inode->i_op = &cifs_dir_inode_ops; - tmp_inode->i_fop = &cifs_dir_ops; - } else if (S_ISLNK(tmp_inode->i_mode)) { - cFYI(1, ("Symbolic Link inode")); - tmp_inode->i_op = &cifs_symlink_inode_ops; - } else { - cFYI(1, ("Init special inode")); - init_special_inode(tmp_inode, tmp_inode->i_mode, - tmp_inode->i_rdev); - } +void +cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info, + struct cifs_sb_info *cifs_sb) +{ + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_cifsattrs = le32_to_cpu(info->ExtFileAttributes); + fattr->cf_eof = le64_to_cpu(info->EndOfFile); + fattr->cf_bytes = le64_to_cpu(info->AllocationSize); + fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); + fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); + fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); + + cifs_fill_common_info(fattr, cifs_sb); } -static void unix_fill_in_inode(struct inode *tmp_inode, - FILE_UNIX_INFO *pfindData, unsigned int *pobject_type, int isNewInode) +void +cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info, + struct cifs_sb_info *cifs_sb) { - loff_t local_size; - struct timespec local_mtime; - - struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb); - - __u32 type = le32_to_cpu(pfindData->Type); - __u64 num_of_bytes = le64_to_cpu(pfindData->NumOfBytes); - __u64 end_of_file = le64_to_cpu(pfindData->EndOfFile); - cifsInfo->time = jiffies; - atomic_inc(&cifsInfo->inUse); - - /* save mtime and size */ - local_mtime = tmp_inode->i_mtime; - local_size = tmp_inode->i_size; - - tmp_inode->i_atime = - cifs_NTtimeToUnix(pfindData->LastAccessTime); - tmp_inode->i_mtime = - cifs_NTtimeToUnix(pfindData->LastModificationTime); - tmp_inode->i_ctime = - cifs_NTtimeToUnix(pfindData->LastStatusChange); - - tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions); - /* since we set the inode type below we need to mask off type - to avoid strange results if bits above were corrupt */ - tmp_inode->i_mode &= ~S_IFMT; - if (type == UNIX_FILE) { - *pobject_type = DT_REG; - tmp_inode->i_mode |= S_IFREG; - } else if (type == UNIX_SYMLINK) { - *pobject_type = DT_LNK; - tmp_inode->i_mode |= S_IFLNK; - } else if (type == UNIX_DIR) { - *pobject_type = DT_DIR; - tmp_inode->i_mode |= S_IFDIR; - } else if (type == UNIX_CHARDEV) { - *pobject_type = DT_CHR; - tmp_inode->i_mode |= S_IFCHR; - tmp_inode->i_rdev = MKDEV(le64_to_cpu(pfindData->DevMajor), - le64_to_cpu(pfindData->DevMinor) & MINORMASK); - } else if (type == UNIX_BLOCKDEV) { - *pobject_type = DT_BLK; - tmp_inode->i_mode |= S_IFBLK; - tmp_inode->i_rdev = MKDEV(le64_to_cpu(pfindData->DevMajor), - le64_to_cpu(pfindData->DevMinor) & MINORMASK); - } else if (type == UNIX_FIFO) { - *pobject_type = DT_FIFO; - tmp_inode->i_mode |= S_IFIFO; - } else if (type == UNIX_SOCKET) { - *pobject_type = DT_SOCK; - tmp_inode->i_mode |= S_IFSOCK; - } else { - /* safest to just call it a file */ - *pobject_type = DT_REG; - tmp_inode->i_mode |= S_IFREG; - cFYI(1, ("unknown inode type %d", type)); - } + int offset = cifs_sb->tcon->ses->server->timeAdj; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) - tmp_inode->i_uid = cifs_sb->mnt_uid; - else - tmp_inode->i_uid = le64_to_cpu(pfindData->Uid); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) - tmp_inode->i_gid = cifs_sb->mnt_gid; - else - tmp_inode->i_gid = le64_to_cpu(pfindData->Gid); - tmp_inode->i_nlink = le64_to_cpu(pfindData->Nlinks); - - cifsInfo->server_eof = end_of_file; - spin_lock(&tmp_inode->i_lock); - if (is_size_safe_to_change(cifsInfo, end_of_file)) { - /* can not safely change the file size here if the - client is writing to it due to potential races */ - i_size_write(tmp_inode, end_of_file); - - /* 512 bytes (2**9) is the fake blocksize that must be used */ - /* for this calculation, not the real blocksize */ - tmp_inode->i_blocks = (512 - 1 + num_of_bytes) >> 9; - } - spin_unlock(&tmp_inode->i_lock); + memset(fattr, 0, sizeof(*fattr)); + fattr->cf_atime = cnvrtDosUnixTm(info->LastAccessDate, + info->LastAccessTime, offset); + fattr->cf_ctime = cnvrtDosUnixTm(info->LastWriteDate, + info->LastWriteTime, offset); + fattr->cf_mtime = cnvrtDosUnixTm(info->LastWriteDate, + info->LastWriteTime, offset); - if (S_ISREG(tmp_inode->i_mode)) { - cFYI(1, ("File inode")); - tmp_inode->i_op = &cifs_file_inode_ops; + fattr->cf_cifsattrs = le16_to_cpu(info->Attributes); + fattr->cf_bytes = le32_to_cpu(info->AllocationSize); + fattr->cf_eof = le32_to_cpu(info->DataSize); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) - tmp_inode->i_fop = &cifs_file_direct_nobrl_ops; - else - tmp_inode->i_fop = &cifs_file_direct_ops; - } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) - tmp_inode->i_fop = &cifs_file_nobrl_ops; - else - tmp_inode->i_fop = &cifs_file_ops; - - if ((cifs_sb->tcon) && (cifs_sb->tcon->ses) && - (cifs_sb->tcon->ses->server->maxBuf < - PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)) - tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf; - else - tmp_inode->i_data.a_ops = &cifs_addr_ops; - - if (isNewInode) - return; /* No sense invalidating pages for new inode - since we have not started caching readahead - file data for it yet */ - - if (timespec_equal(&tmp_inode->i_mtime, &local_mtime) && - (local_size == tmp_inode->i_size)) { - cFYI(1, ("inode exists but unchanged")); - } else { - /* file may have changed on server */ - cFYI(1, ("invalidate inode, readdir detected change")); - invalidate_remote_inode(tmp_inode); - } - } else if (S_ISDIR(tmp_inode->i_mode)) { - cFYI(1, ("Directory inode")); - tmp_inode->i_op = &cifs_dir_inode_ops; - tmp_inode->i_fop = &cifs_dir_ops; - } else if (S_ISLNK(tmp_inode->i_mode)) { - cFYI(1, ("Symbolic Link inode")); - tmp_inode->i_op = &cifs_symlink_inode_ops; -/* tmp_inode->i_fop = *//* do not need to set to anything */ - } else { - cFYI(1, ("Special inode")); - init_special_inode(tmp_inode, tmp_inode->i_mode, - tmp_inode->i_rdev); - } + cifs_fill_common_info(fattr, cifs_sb); } /* BB eventually need to add the following helper function to @@ -872,7 +621,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, len = strnlen(filename, PATH_MAX); } - *pinum = le64_to_cpu(pFindData->UniqueId); + *pinum = le64_to_cpu(pFindData->basic.UniqueId); } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { FILE_DIRECTORY_INFO *pFindData = (FILE_DIRECTORY_INFO *)current_entry; @@ -932,11 +681,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, int rc = 0; struct qstr qstring; struct cifsFileInfo *pCifsF; - unsigned int obj_type; - __u64 inum; + u64 inum; + ino_t ino; + struct super_block *sb; struct cifs_sb_info *cifs_sb; - struct inode *tmp_inode; struct dentry *tmp_dentry; + struct cifs_fattr fattr; /* get filename and len into qstring */ /* get dentry */ @@ -954,60 +704,53 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, if (rc != 0) return 0; - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + sb = file->f_path.dentry->d_sb; + cifs_sb = CIFS_SB(sb); qstring.name = scratch_buf; rc = cifs_get_name_from_search_buf(&qstring, pfindEntry, pCifsF->srch_inf.info_level, pCifsF->srch_inf.unicode, cifs_sb, - max_len, - &inum /* returned */); + max_len, &inum /* returned */); if (rc) return rc; - /* only these two infolevels return valid inode numbers */ - if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX || - pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO) - rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry, - &inum); - else - rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry, - NULL); - - if ((tmp_inode == NULL) || (tmp_dentry == NULL)) - return -ENOMEM; - - /* we pass in rc below, indicating whether it is a new inode, - so we can figure out whether to invalidate the inode cached - data if the file has changed */ if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) - unix_fill_in_inode(tmp_inode, - (FILE_UNIX_INFO *)pfindEntry, - &obj_type, rc); + cifs_unix_basic_to_fattr(&fattr, + &((FILE_UNIX_INFO *) pfindEntry)->basic, + cifs_sb); else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) - fill_in_inode(tmp_inode, 0 /* old level 1 buffer type */, - pfindEntry, &obj_type, rc); + cifs_std_info_to_fattr(&fattr, (FIND_FILE_STANDARD_INFO *) + pfindEntry, cifs_sb); else - fill_in_inode(tmp_inode, 1 /* NT */, pfindEntry, &obj_type, rc); + cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *) + pfindEntry, cifs_sb); - if (rc) /* new inode - needs to be tied to dentry */ { - d_instantiate(tmp_dentry, tmp_inode); - if (rc == 2) - d_rehash(tmp_dentry); - } + /* FIXME: make _to_fattr functions fill this out */ + if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO) + fattr.cf_uniqueid = inum; + else + fattr.cf_uniqueid = iunique(sb, ROOT_I); + ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); + tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, - tmp_inode->i_ino, obj_type); + ino, fattr.cf_dtype); + + /* + * we can not return filldir errors to the caller since they are + * "normal" when the stat blocksize is too small - we return remapped + * error instead + * + * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above + * case already. Why should we be clobbering other errors from it? + */ if (rc) { cFYI(1, ("filldir rc = %d", rc)); - /* we can not return filldir errors to the caller - since they are "normal" when the stat blocksize - is too small - we return remapped error instead */ rc = -EOVERFLOW; } - dput(tmp_dentry); return rc; } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 0ad3e2d116a6..1da4ab250eae 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -119,20 +119,19 @@ AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon) temp->pinode = pinode; temp->tcon = tcon; temp->netfid = fid; - spin_lock(&GlobalMid_Lock); - list_add_tail(&temp->qhead, &GlobalOplock_Q); - spin_unlock(&GlobalMid_Lock); + spin_lock(&cifs_oplock_lock); + list_add_tail(&temp->qhead, &cifs_oplock_list); + spin_unlock(&cifs_oplock_lock); } return temp; - } void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry) { - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_oplock_lock); /* should we check if list empty first? */ list_del(&oplockEntry->qhead); - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_oplock_lock); kmem_cache_free(cifs_oplock_cachep, oplockEntry); } @@ -144,14 +143,14 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon) if (tcon == NULL) return; - spin_lock(&GlobalMid_Lock); - list_for_each_entry(temp, &GlobalOplock_Q, qhead) { + spin_lock(&cifs_oplock_lock); + list_for_each_entry(temp, &cifs_oplock_list, qhead) { if ((temp->tcon) && (temp->tcon == tcon)) { list_del(&temp->qhead); kmem_cache_free(cifs_oplock_cachep, temp); } } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_oplock_lock); } static int diff --git a/fs/compat.c b/fs/compat.c index cdd51a3a7c53..6d6f98fe64a0 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -32,7 +32,6 @@ #include <linux/smb_mount.h> #include <linux/ncp_mount.h> #include <linux/nfs4_mount.h> -#include <linux/smp_lock.h> #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/module.h> @@ -1486,20 +1485,15 @@ int compat_do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_guard_mutex); - if (retval < 0) + retval = prepare_bprm_creds(bprm); + if (retval) goto out_free; - current->in_execve = 1; - - retval = -ENOMEM; - bprm->cred = prepare_exec_creds(); - if (!bprm->cred) - goto out_unlock; retval = check_unsafe_exec(bprm); if (retval < 0) - goto out_unlock; + goto out_free; clear_in_exec = retval; + current->in_execve = 1; file = open_exec(filename); retval = PTR_ERR(file); @@ -1548,7 +1542,6 @@ int compat_do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1568,10 +1561,7 @@ out_file: out_unmark: if (clear_in_exec) current->fs->in_exec = 0; - -out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 626c7483b4de..f91fd51b32e3 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -19,6 +19,7 @@ #include <linux/compiler.h> #include <linux/sched.h> #include <linux/smp.h> +#include <linux/smp_lock.h> #include <linux/ioctl.h> #include <linux/if.h> #include <linux/if_bridge.h> @@ -1904,6 +1905,7 @@ COMPATIBLE_IOCTL(FIONCLEX) COMPATIBLE_IOCTL(FIOASYNC) COMPATIBLE_IOCTL(FIONBIO) COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ +COMPATIBLE_IOCTL(FS_IOC_FIEMAP) /* 0x00 */ COMPATIBLE_IOCTL(FIBMAP) COMPATIBLE_IOCTL(FIGETBSZ) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 4921e7426d95..a2f746066c5d 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = { }; static struct backing_dev_info configfs_backing_dev_info = { + .name = "configfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; diff --git a/fs/dcache.c b/fs/dcache.c index 9e5cd3c3a6ba..a100fa35a48f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include <linux/swap.h> #include <linux/bootmem.h> #include <linux/fs_struct.h> +#include <linux/hardirq.h> #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 205ec95b347e..eb507c453c5f 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -435,7 +435,7 @@ static int search_rsb(struct dlm_ls *ls, char *name, int len, int b, static int find_rsb(struct dlm_ls *ls, char *name, int namelen, unsigned int flags, struct dlm_rsb **r_ret) { - struct dlm_rsb *r, *tmp; + struct dlm_rsb *r = NULL, *tmp; uint32_t hash, bucket; int error = -EINVAL; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index cdb580a9c7a2..240cef14fe58 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -106,6 +106,7 @@ struct connection { #define CF_CONNECT_PENDING 3 #define CF_INIT_PENDING 4 #define CF_IS_OTHERCON 5 +#define CF_CLOSE 6 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; int (*rx_action) (struct connection *); /* What to do when active */ @@ -299,6 +300,8 @@ static void lowcomms_write_space(struct sock *sk) static inline void lowcomms_connect_sock(struct connection *con) { + if (test_bit(CF_CLOSE, &con->flags)) + return; if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) queue_work(send_workqueue, &con->swork); } @@ -902,7 +905,7 @@ static void tcp_connect_to_sock(struct connection *con) int result = -EHOSTUNREACH; struct sockaddr_storage saddr, src_addr; int addr_len; - struct socket *sock; + struct socket *sock = NULL; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -926,10 +929,8 @@ static void tcp_connect_to_sock(struct connection *con) goto out_err; memset(&saddr, 0, sizeof(saddr)); - if (dlm_nodeid_to_addr(con->nodeid, &saddr)) { - sock_release(sock); + if (dlm_nodeid_to_addr(con->nodeid, &saddr)) goto out_err; - } sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; @@ -962,6 +963,8 @@ out_err: if (con->sock) { sock_release(con->sock); con->sock = NULL; + } else if (sock) { + sock_release(sock); } /* * Some errors are fatal and this list might need adjusting. For other @@ -1282,7 +1285,6 @@ out: static void send_to_sock(struct connection *con) { int ret = 0; - ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int); const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; struct writequeue_entry *e; int len, offset; @@ -1291,8 +1293,6 @@ static void send_to_sock(struct connection *con) if (con->sock == NULL) goto out_connect; - sendpage = con->sock->ops->sendpage; - spin_lock(&con->writequeue_lock); for (;;) { e = list_entry(con->writequeue.next, struct writequeue_entry, @@ -1307,8 +1307,8 @@ static void send_to_sock(struct connection *con) ret = 0; if (len) { - ret = sendpage(con->sock, e->page, offset, len, - msg_flags); + ret = kernel_sendpage(con->sock, e->page, offset, len, + msg_flags); if (ret == -EAGAIN || ret == 0) { cond_resched(); goto out; @@ -1368,6 +1368,13 @@ int dlm_lowcomms_close(int nodeid) log_print("closing connection to node %d", nodeid); con = nodeid2con(nodeid, 0); if (con) { + clear_bit(CF_CONNECT_PENDING, &con->flags); + clear_bit(CF_WRITE_PENDING, &con->flags); + set_bit(CF_CLOSE, &con->flags); + if (cancel_work_sync(&con->swork)) + log_print("canceled swork for node %d", nodeid); + if (cancel_work_sync(&con->rwork)) + log_print("canceled rwork for node %d", nodeid); clean_one_writequeue(con); close_connection(con, true); } @@ -1393,9 +1400,10 @@ static void process_send_sockets(struct work_struct *work) if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { con->connect_action(con); + set_bit(CF_WRITE_PENDING, &con->flags); } - clear_bit(CF_WRITE_PENDING, &con->flags); - send_to_sock(con); + if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags)) + send_to_sock(con); } diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index ccc9d62c462d..55ea369f43a9 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb) return rv; } - return genlmsg_unicast(skb, listener_nlpid); + return genlmsg_unicast(&init_net, skb, listener_nlpid); } static int user_cmd(struct sk_buff *skb, struct genl_info *info) diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 894a32d438d5..16f682e26c07 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -353,7 +353,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, { struct dlm_plock_info info; struct plock_op *op; - int found = 0; + int found = 0, do_callback = 0; if (count != sizeof(info)) return -EINVAL; @@ -366,21 +366,24 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, spin_lock(&ops_lock); list_for_each_entry(op, &recv_list, list) { - if (op->info.fsid == info.fsid && op->info.number == info.number && + if (op->info.fsid == info.fsid && + op->info.number == info.number && op->info.owner == info.owner) { + struct plock_xop *xop = (struct plock_xop *)op; list_del_init(&op->list); - found = 1; - op->done = 1; memcpy(&op->info, &info, sizeof(info)); + if (xop->callback) + do_callback = 1; + else + op->done = 1; + found = 1; break; } } spin_unlock(&ops_lock); if (found) { - struct plock_xop *xop; - xop = (struct plock_xop *)op; - if (xop->callback) + if (do_callback) dlm_plock_callback(op); else wake_up(&recv_wq); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index af737bb56cb7..259525c9abb8 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, } (*new_auth_tok)->session_key.encrypted_key_size = (body_size - (ECRYPTFS_SALT_SIZE + 5)); + if ((*new_auth_tok)->session_key.encrypted_key_size + > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + printk(KERN_WARNING "Tag 3 packet contains key larger " + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n"); + rc = -EINVAL; + goto out_free; + } if (unlikely(data[(*packet_size)++] != 0x04)) { printk(KERN_WARNING "Unknown version number [%d]\n", data[(*packet_size) - 1]); @@ -1449,6 +1456,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents, rc = -EINVAL; goto out; } + if (unlikely((*tag_11_contents_size) > max_contents_bytes)) { + printk(KERN_ERR "Literal data section in tag 11 packet exceeds " + "expected size\n"); + rc = -EINVAL; + goto out; + } if (data[(*packet_size)++] != 0x62) { printk(KERN_WARNING "Unrecognizable packet\n"); rc = -EINVAL; diff --git a/fs/exec.c b/fs/exec.c index e639957d7a57..172ceb6edde4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -678,8 +678,8 @@ exit: } EXPORT_SYMBOL(open_exec); -int kernel_read(struct file *file, unsigned long offset, - char *addr, unsigned long count) +int kernel_read(struct file *file, loff_t offset, + char *addr, unsigned long count) { mm_segment_t old_fs; loff_t pos = offset; @@ -1016,6 +1016,35 @@ out: EXPORT_SYMBOL(flush_old_exec); /* + * Prepare credentials and lock ->cred_guard_mutex. + * install_exec_creds() commits the new creds and drops the lock. + * Or, if exec fails before, free_bprm() should release ->cred and + * and unlock. + */ +int prepare_bprm_creds(struct linux_binprm *bprm) +{ + if (mutex_lock_interruptible(¤t->cred_guard_mutex)) + return -ERESTARTNOINTR; + + bprm->cred = prepare_exec_creds(); + if (likely(bprm->cred)) + return 0; + + mutex_unlock(¤t->cred_guard_mutex); + return -ENOMEM; +} + +void free_bprm(struct linux_binprm *bprm) +{ + free_arg_pages(bprm); + if (bprm->cred) { + mutex_unlock(¤t->cred_guard_mutex); + abort_creds(bprm->cred); + } + kfree(bprm); +} + +/* * install the new credentials for this executable */ void install_exec_creds(struct linux_binprm *bprm) @@ -1024,12 +1053,13 @@ void install_exec_creds(struct linux_binprm *bprm) commit_creds(bprm->cred); bprm->cred = NULL; - - /* cred_guard_mutex must be held at least to this point to prevent + /* + * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's - * credentials; any time after this it may be unlocked */ - + * credentials; any time after this it may be unlocked. + */ security_bprm_committed_creds(bprm); + mutex_unlock(¤t->cred_guard_mutex); } EXPORT_SYMBOL(install_exec_creds); @@ -1246,14 +1276,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) EXPORT_SYMBOL(search_binary_handler); -void free_bprm(struct linux_binprm *bprm) -{ - free_arg_pages(bprm); - if (bprm->cred) - abort_creds(bprm->cred); - kfree(bprm); -} - /* * sys_execve() executes a new program. */ @@ -1277,20 +1299,15 @@ int do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_guard_mutex); - if (retval < 0) + retval = prepare_bprm_creds(bprm); + if (retval) goto out_free; - current->in_execve = 1; - - retval = -ENOMEM; - bprm->cred = prepare_exec_creds(); - if (!bprm->cred) - goto out_unlock; retval = check_unsafe_exec(bprm); if (retval < 0) - goto out_unlock; + goto out_free; clear_in_exec = retval; + current->in_execve = 1; file = open_exec(filename); retval = PTR_ERR(file); @@ -1340,7 +1357,6 @@ int do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1360,10 +1376,7 @@ out_file: out_unmark: if (clear_in_exec) current->fs->in_exec = 0; - -out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/fs/exofs/common.h b/fs/exofs/common.h index 24667eedc023..c6718e4817fe 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h @@ -2,9 +2,7 @@ * common.h - Common definitions for both Kernel and user-mode utilities * * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) - * Copyright (C) 2005, 2006 - * International Business Machines + * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 * Boaz Harrosh <bharrosh@panasas.com> * diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 65b0c8c776a1..4cfab1cc75c0 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -1,8 +1,6 @@ /* * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) - * Copyright (C) 2005, 2006 - * International Business Machines + * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 * Boaz Harrosh <bharrosh@panasas.com> * diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 0fd4c7859679..5ec72e020b22 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -1,8 +1,6 @@ /* * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) - * Copyright (C) 2005, 2006 - * International Business Machines + * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 * Boaz Harrosh <bharrosh@panasas.com> * @@ -156,6 +154,9 @@ ino_t exofs_parent_ino(struct dentry *child); int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, struct inode *); +/* super.c */ +int exofs_sync_fs(struct super_block *sb, int wait); + /********************* * operation vectors * *********************/ diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 6ed7fe484752..839b9dc1e70f 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -1,8 +1,6 @@ /* * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) - * Copyright (C) 2005, 2006 - * International Business Machines + * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 * Boaz Harrosh <bharrosh@panasas.com> * @@ -47,16 +45,23 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry, { int ret; struct address_space *mapping = filp->f_mapping; + struct inode *inode = dentry->d_inode; + struct super_block *sb; ret = filemap_write_and_wait(mapping); if (ret) return ret; - /*Note: file_fsync below also calles sync_blockdev, which is a no-op - * for exofs, but other then that it does sync_inode and - * sync_superblock which is what we need here. - */ - return file_fsync(filp, dentry, datasync); + /* sync the inode attributes */ + ret = write_inode_now(inode, 1); + + /* This is a good place to write the sb */ + /* TODO: Sechedule an sb-sync on create */ + sb = inode->i_sb; + if (sb->s_dirt) + exofs_sync_fs(sb, 1); + + return ret; } static int exofs_flush(struct file *file, fl_owner_t id) diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 77d0a295eb1c..6c10f7476699 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1,8 +1,6 @@ /* * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) - * Copyright (C) 2005, 2006 - * International Business Machines + * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 * Boaz Harrosh <bharrosh@panasas.com> * @@ -295,6 +293,9 @@ static int read_exec(struct page_collect *pcol, bool is_sync) err: if (!is_sync) _unlock_pcol_pages(pcol, ret, READ); + else /* Pages unlocked by caller in sync mode only free bio */ + pcol_free(pcol); + kfree(pcol_copy); if (or) osd_end_request(or); diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 77fdd765e76d..b7dd0c236863 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -1,8 +1,6 @@ /* * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) - * Copyright (C) 2005, 2006 - * International Business Machines + * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 * Boaz Harrosh <bharrosh@panasas.com> * diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c index b3d2ccb87aaa..4372542df284 100644 --- a/fs/exofs/osd.c +++ b/fs/exofs/osd.c @@ -1,8 +1,6 @@ /* * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) - * Copyright (C) 2005, 2006 - * International Business Machines + * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 * Boaz Harrosh <bharrosh@panasas.com> * diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 8216c5b77b53..5ab10c3bbebe 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -1,8 +1,6 @@ /* * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) - * Copyright (C) 2005, 2006 - * International Business Machines + * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 * Boaz Harrosh <bharrosh@panasas.com> * @@ -33,6 +31,7 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include <linux/smp_lock.h> #include <linux/string.h> #include <linux/parser.h> #include <linux/vfs.h> @@ -200,7 +199,7 @@ static const struct export_operations exofs_export_ops; /* * Write the superblock to the OSD */ -static int exofs_sync_fs(struct super_block *sb, int wait) +int exofs_sync_fs(struct super_block *sb, int wait) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c index 36e2d7bc7f7b..4dd687c3e747 100644 --- a/fs/exofs/symlink.c +++ b/fs/exofs/symlink.c @@ -1,8 +1,6 @@ /* * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com) - * Copyright (C) 2005, 2006 - * International Business Machines + * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 * Boaz Harrosh <bharrosh@panasas.com> * diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index d636e1297cad..a63d44256a70 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -static int +int ext2_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); @@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext2_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext2_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index ecefe478898f..3ff6cbb9ac44 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size) #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern int ext2_permission (struct inode *, int); +extern int ext2_check_acl (struct inode *, int); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); #else #include <linux/sched.h> -#define ext2_permission NULL +#define ext2_check_acl NULL #define ext2_get_acl NULL #define ext2_set_acl NULL diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 2b9e47dc9222..a2f3afd1a1c1 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e27130341d4f..1c1638f873a4 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode, unlock_buffer(bh); mark_buffer_dirty_inode(bh, inode); /* We used to sync bh here if IS_SYNC(inode). - * But we now rely upon generic_osync_inode() + * But we now rely upon generic_write_sync() * and b_inode_buffers. But not for directories. */ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 7cb4badef927..e7431309bdca 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -13,7 +13,6 @@ #include <linux/sched.h> #include <linux/compat.h> #include <linux/mount.h> -#include <linux/smp_lock.h> #include <asm/current.h> #include <asm/uaccess.h> diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index e1dedb0f7873..23701f289e98 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -362,6 +362,10 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, if (dir_de) { if (old_dir != new_dir) ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); + else { + kunmap(dir_page); + page_cache_release(dir_page); + } inode_dec_link_count(old_dir); } return 0; @@ -396,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, }; const struct inode_operations ext2_special_inode_operations = { @@ -407,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, }; diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig index fb3c1a21b135..522b15498f45 100644 --- a/fs/ext3/Kconfig +++ b/fs/ext3/Kconfig @@ -29,23 +29,25 @@ config EXT3_FS module will be called ext3. config EXT3_DEFAULTS_TO_ORDERED - bool "Default to 'data=ordered' in ext3 (legacy option)" + bool "Default to 'data=ordered' in ext3" depends on EXT3_FS help - If a filesystem does not explicitly specify a data ordering - mode, and the journal capability allowed it, ext3 used to - historically default to 'data=ordered'. - - That was a rather unfortunate choice, because it leads to all - kinds of latency problems, and the 'data=writeback' mode is more - appropriate these days. - - You should probably always answer 'n' here, and if you really - want to use 'data=ordered' mode, set it in the filesystem itself - with 'tune2fs -o journal_data_ordered'. - - But if you really want to enable the legacy default, you can do - so by answering 'y' to this question. + The journal mode options for ext3 have different tradeoffs + between when data is guaranteed to be on disk and + performance. The use of "data=writeback" can cause + unwritten data to appear in files after an system crash or + power failure, which can be a security issue. However, + "data=ordered" mode can also result in major performance + problems, including seconds-long delays before an fsync() + call returns. For details, see: + + http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs + + If you have been historically happy with ext3's performance, + data=ordered mode will be a safe choice and you should + answer 'y' here. If you understand the reliability and data + privacy issues of data=writeback and are willing to make + that trade off, answer 'n'. config EXT3_FS_XATTR bool "Ext3 extended attributes" diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index e167bae37ef0..c9b0df376b5f 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -static int +int ext3_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); @@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext3_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext3_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext3_new_inode. * diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 07d15a3a5969..597334626de9 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size) #ifdef CONFIG_EXT3_FS_POSIX_ACL /* acl.c */ -extern int ext3_permission (struct inode *, int); +extern int ext3_check_acl (struct inode *, int); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext3_permission NULL +#define ext3_check_acl NULL static inline int ext3_acl_chmod(struct inode *inode) diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 3d724a95882f..373fa90c796a 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext3_get_blocks_handle(NULL, inode, blk, 1, - &map_bh, 0, 0); + err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 5b49704b231b..388bbdfa0b4e 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp) return 0; } -static ssize_t -ext3_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; - ssize_t ret; - int err; - - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - - /* - * Skip flushing if there was an error, or if nothing was written. - */ - if (ret <= 0) - return ret; - - /* - * If the inode is IS_SYNC, or is O_SYNC and we are doing data - * journalling then we need to make sure that we force the transaction - * to disk to keep all metadata uptodate synchronously. - */ - if (file->f_flags & O_SYNC) { - /* - * If we are non-data-journaled, then the dirty data has - * already been flushed to backing store by generic_osync_inode, - * and the inode has been flushed too if there have been any - * modifications other than mere timestamp updates. - * - * Open question --- do we care about flushing timestamps too - * if the inode is IS_SYNC? - */ - if (!ext3_should_journal_data(inode)) - return ret; - - goto force_commit; - } - - /* - * So we know that there has been no forced data flush. If the inode - * is marked IS_SYNC, we need to force one ourselves. - */ - if (!IS_SYNC(inode)) - return ret; - - /* - * Open question #2 --- should we force data to disk here too? If we - * don't, the only impact is that data=writeback filesystems won't - * flush data to disk automatically on IS_SYNC, only metadata (but - * historically, that is what ext2 has done.) - */ - -force_commit: - err = ext3_force_commit(inode->i_sb); - if (err) - return err; - return ret; -} - const struct file_operations ext3_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, - .aio_write = ext3_file_write, + .aio_write = generic_file_aio_write, .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, @@ -137,7 +78,7 @@ const struct inode_operations ext3_file_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index d33634119e17..451d166bbe93 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -23,6 +23,7 @@ */ #include <linux/time.h> +#include <linux/blkdev.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/writeback.h> @@ -73,7 +74,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) } if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; + goto flush; /* * The VFS has written the file data. If the inode is unaltered @@ -85,7 +86,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) .nr_to_write = 0, /* sys_fsync did this */ }; ret = sync_inode(inode, &wbc); + goto out; } +flush: + /* + * In case we didn't commit a transaction, we have to flush + * disk caches manually so that data really is on persistent + * storage + */ + if (test_opt(inode->i_sb, BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); out: return ret; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 5f51fed5c750..cd098a7b77fc 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -172,10 +172,21 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) * so before we call here everything must be consistently dirtied against * this transaction. */ -static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) +static int truncate_restart_transaction(handle_t *handle, struct inode *inode) { + int ret; + jbd_debug(2, "restarting handle %p\n", handle); - return ext3_journal_restart(handle, blocks_for_truncate(inode)); + /* + * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle + * At this moment, get_block can be called only for blocks inside + * i_size since page cache has been already dropped and writes are + * blocked by i_mutex. So we can safely drop the truncate_mutex. + */ + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + ret = ext3_journal_restart(handle, blocks_for_truncate(inode)); + mutex_lock(&EXT3_I(inode)->truncate_mutex); + return ret; } /* @@ -788,7 +799,7 @@ err_out: int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int create) { int err = -EIO; int offsets[4]; @@ -911,13 +922,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!err) err = ext3_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - /* - * i_disksize growing is protected by truncate_mutex. Don't forget to - * protect it if you're about to implement concurrent - * ext3_get_block() -bzzz - */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; @@ -972,7 +976,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, } ret = ext3_get_blocks_handle(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1005,7 +1009,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext3_get_blocks_handle(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create); /* * ext3_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. @@ -1193,15 +1197,16 @@ write_begin_failed: * i_size_read because we hold i_mutex. * * Add inode to orphan list in case we crash before truncate - * finishes. + * finishes. Do this only if ext3_can_truncate() agrees so + * that orphan processing code is happy. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1287,7 +1292,7 @@ static int ext3_ordered_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ret2 = ext3_journal_stop(handle); if (!ret) @@ -1296,7 +1301,7 @@ static int ext3_ordered_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } @@ -1315,14 +1320,14 @@ static int ext3_writeback_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ret = ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } @@ -1358,7 +1363,7 @@ static int ext3_journalled_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; if (inode->i_size > EXT3_I(inode)->i_disksize) { @@ -1375,7 +1380,7 @@ static int ext3_journalled_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } @@ -2078,7 +2083,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode, ext3_journal_dirty_metadata(handle, bh); } ext3_mark_inode_dirty(handle, inode); - ext3_journal_test_restart(handle, inode); + truncate_restart_transaction(handle, inode); if (bh) { BUFFER_TRACE(bh, "retaking write access"); ext3_journal_get_write_access(handle, bh); @@ -2288,7 +2293,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, return; if (try_to_extend_transaction(handle, inode)) { ext3_mark_inode_dirty(handle, inode); - ext3_journal_test_restart(handle, inode); + truncate_restart_transaction(handle, inode); } ext3_free_blocks(handle, inode, nr, 1); @@ -2898,6 +2903,10 @@ static int ext3_do_update_inode(handle_t *handle, struct buffer_head *bh = iloc->bh; int err = 0, rc, block; +again: + /* we can't allow multiple procs in here at once, its a bit racey */ + lock_buffer(bh); + /* For fields not not tracking in the in-memory inode, * initialise them to zero for new inodes. */ if (ei->i_state & EXT3_STATE_NEW) @@ -2957,16 +2966,20 @@ static int ext3_do_update_inode(handle_t *handle, /* If this is the first large file * created, add a flag to the superblock. */ + unlock_buffer(bh); err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); if (err) goto out_brelse; + ext3_update_dynamic_rev(sb); EXT3_SET_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_LARGE_FILE); handle->h_sync = 1; err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + /* get our lock and start over */ + goto again; } } } @@ -2989,6 +3002,7 @@ static int ext3_do_update_inode(handle_t *handle, raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + unlock_buffer(bh); rc = ext3_journal_dirty_metadata(handle, bh); if (!err) err = rc; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 6ff7b9730234..aad6400c9b77 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, }; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 524b349c6299..a8d80a7f1105 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -543,6 +543,19 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl #endif } +static char *data_mode_string(unsigned long mode) +{ + switch (mode) { + case EXT3_MOUNT_JOURNAL_DATA: + return "journal"; + case EXT3_MOUNT_ORDERED_DATA: + return "ordered"; + case EXT3_MOUNT_WRITEBACK_DATA: + return "writeback"; + } + return "unknown"; +} + /* * Show an option if * - it's set to a non-default value OR @@ -616,13 +629,8 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); - if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) - seq_puts(seq, ",data=journal"); - else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA) - seq_puts(seq, ",data=ordered"); - else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) - seq_puts(seq, ",data=writeback"); - + seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt & + EXT3_MOUNT_DATA_FLAGS)); if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); @@ -1024,12 +1032,18 @@ static int parse_options (char *options, struct super_block *sb, datacheck: if (is_remount) { if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) - != data_opt) { - printk(KERN_ERR - "EXT3-fs: cannot change data " - "mode on remount\n"); - return 0; - } + == data_opt) + break; + printk(KERN_ERR + "EXT3-fs (device %s): Cannot change " + "data mode on remount. The filesystem " + "is mounted in data=%s mode and you " + "try to remount it in data=%s mode.\n", + sb->s_id, + data_mode_string(sbi->s_mount_opt & + EXT3_MOUNT_DATA_FLAGS), + data_mode_string(data_opt)); + return 0; } else { sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; sbi->s_mount_opt |= data_opt; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index f6d8967149ca..0df88b2a69b0 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -static int +int ext4_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); @@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext4_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext4_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 949789d2bba6..9d843d5deac4 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_permission(struct inode *, int); +extern int ext4_check_acl(struct inode *, int); extern int ext4_acl_chmod(struct inode *); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext4_permission NULL +#define ext4_check_acl NULL static inline int ext4_acl_chmod(struct inode *inode) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0ddf7e55abe1..9714db393efe 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -93,20 +93,20 @@ typedef unsigned int ext4_group_t; struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; /* logical block in target inode */ ext4_lblk_t logical; - /* phys. target (a hint) */ - ext4_fsblk_t goal; /* the closest logical allocated block to the left */ ext4_lblk_t lleft; - /* phys. block for ^^^ */ - ext4_fsblk_t pleft; /* the closest logical allocated block to the right */ ext4_lblk_t lright; - /* phys. block for ^^^ */ + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ ext4_fsblk_t pright; - /* how many blocks we want to allocate */ - unsigned int len; /* flags. see above EXT4_MB_HINT_* */ unsigned int flags; }; diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index ad13a84644e1..eb27fd0f2ee8 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -43,6 +43,8 @@ int __ext4_journal_forget(const char *where, handle_t *handle, ext4_journal_abort_handle(where, __func__, bh, handle, err); } + else + brelse(bh); return err; } @@ -57,6 +59,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, ext4_journal_abort_handle(where, __func__, bh, handle, err); } + else + brelse(bh); return err; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index be2f426f6805..139fb8cb87e4 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -131,9 +131,11 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle, int __ext4_journal_get_write_access(const char *where, handle_t *handle, struct buffer_head *bh); +/* When called with an invalid handle, this will still do a put on the BH */ int __ext4_journal_forget(const char *where, handle_t *handle, struct buffer_head *bh); +/* When called with an invalid handle, this will still do a put on the BH */ int __ext4_journal_revoke(const char *where, handle_t *handle, ext4_fsblk_t blocknr, struct buffer_head *bh); @@ -281,10 +283,10 @@ static inline int ext4_should_order_data(struct inode *inode) static inline int ext4_should_writeback_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 0; if (!S_ISREG(inode->i_mode)) return 0; + if (EXT4_JOURNAL(inode) == NULL) + return 1; if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) return 0; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 50322a09bd01..73ebfb44ad75 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1977,6 +1977,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, */ /* 1 bitmap, 1 block group descriptor */ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + return ret; } } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3f1873fef1c6..5ca3eca70a1e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -58,10 +58,7 @@ static ssize_t ext4_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; - ssize_t ret; - int err; + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; /* * If we have encountered a bitmap-format file, the size limit @@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, } } - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - /* - * Skip flushing if there was an error, or if nothing was written. - */ - if (ret <= 0) - return ret; - - /* - * If the inode is IS_SYNC, or is O_SYNC and we are doing data - * journalling then we need to make sure that we force the transaction - * to disk to keep all metadata uptodate synchronously. - */ - if (file->f_flags & O_SYNC) { - /* - * If we are non-data-journaled, then the dirty data has - * already been flushed to backing store by generic_osync_inode, - * and the inode has been flushed too if there have been any - * modifications other than mere timestamp updates. - * - * Open question --- do we care about flushing timestamps too - * if the inode is IS_SYNC? - */ - if (!ext4_should_journal_data(inode)) - return ret; - - goto force_commit; - } - - /* - * So we know that there has been no forced data flush. If the inode - * is marked IS_SYNC, we need to force one ourselves. - */ - if (!IS_SYNC(inode)) - return ret; - - /* - * Open question #2 --- should we force data to disk here too? If we - * don't, the only impact is that data=writeback filesystems won't - * flush data to disk automatically on IS_SYNC, only metadata (but - * historically, that is what ext2 has done.) - */ - -force_commit: - err = ext4_force_commit(inode->i_sb); - if (err) - return err; - return ret; + return generic_file_aio_write(iocb, iov, nr_segs, pos); } static struct vm_operations_struct ext4_file_vm_ops = { @@ -207,7 +158,7 @@ const struct inode_operations ext4_file_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, .fallocate = ext4_fallocate, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 2f645732e3b7..29e6dc7299b8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -833,7 +833,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, if (!goal) goal = sbi->s_inode_goal; - if (goal && goal < le32_to_cpu(sbi->s_es->s_inodes_count)) { + if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); ret2 = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 60a26f3a6f8b..f9c642b22efa 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -78,16 +78,14 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) * but there may still be a record of it in the journal, and that record * still needs to be revoked. * - * If the handle isn't valid we're not journaling so there's nothing to do. + * If the handle isn't valid we're not journaling, but we still need to + * call into ext4_journal_revoke() to put the buffer head. */ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; - if (!ext4_handle_valid(handle)) - return 0; - might_sleep(); BUFFER_TRACE(bh, "enter"); @@ -1513,14 +1511,14 @@ retry: * Add inode to orphan list in case we crash before * truncate finishes */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (pos + len > inode->i_size) { - vmtruncate(inode, inode->i_size); + ext4_truncate(inode); /* - * If vmtruncate failed early the inode might + * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. @@ -1614,7 +1612,7 @@ static int ext4_ordered_write_end(struct file *file, ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them @@ -1628,9 +1626,9 @@ static int ext4_ordered_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - vmtruncate(inode, inode->i_size); + ext4_truncate(inode); /* - * If vmtruncate failed early the inode might still be + * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ @@ -1655,7 +1653,7 @@ static int ext4_writeback_write_end(struct file *file, ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them @@ -1670,9 +1668,9 @@ static int ext4_writeback_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - vmtruncate(inode, inode->i_size); + ext4_truncate(inode); /* - * If vmtruncate failed early the inode might still be + * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ @@ -1722,7 +1720,7 @@ static int ext4_journalled_write_end(struct file *file, unlock_page(page); page_cache_release(page); - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them @@ -1733,9 +1731,9 @@ static int ext4_journalled_write_end(struct file *file, if (!ret) ret = ret2; if (pos + len > inode->i_size) { - vmtruncate(inode, inode->i_size); + ext4_truncate(inode); /* - * If vmtruncate failed early the inode might still be + * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ @@ -2305,15 +2303,9 @@ flush_it: return; } -static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) { - /* - * unmapped buffer is possible for holes. - * delay buffer is possible with delayed allocation. - * We also need to consider unwritten buffer as unmapped. - */ - return (!buffer_mapped(bh) || buffer_delay(bh) || - buffer_unwritten(bh)) && buffer_dirty(bh); + return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); } /* @@ -2398,9 +2390,9 @@ static int __mpage_da_writepage(struct page *page, * We need to try to allocate * unmapped blocks in the same page. * Otherwise we won't make progress - * with the page in ext4_da_writepage + * with the page in ext4_writepage */ - if (ext4_bh_unmapped_or_delay(NULL, bh)) { + if (ext4_bh_delay_or_unwritten(NULL, bh)) { mpage_add_bh_to_extent(mpd, logical, bh->b_size, bh->b_state); @@ -2517,7 +2509,6 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, * so call get_block_wrap with create = 0 */ ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); - BUG_ON(create && ret == 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -2525,15 +2516,102 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, return ret; } +static int bget_one(handle_t *handle, struct buffer_head *bh) +{ + get_bh(bh); + return 0; +} + +static int bput_one(handle_t *handle, struct buffer_head *bh) +{ + put_bh(bh); + return 0; +} + +static int __ext4_journalled_writepage(struct page *page, + struct writeback_control *wbc, + unsigned int len) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs; + handle_t *handle = NULL; + int ret = 0; + int err; + + page_bufs = page_buffers(page); + BUG_ON(!page_bufs); + walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); + + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); + + err = walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); + if (ret == 0) + ret = err; + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + + walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; +out: + return ret; +} + /* + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We even don't + * need to file the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), noone guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start transaction directly because transaction start ranks above page + * lock so we have to do some magic. + * * This function can get called via... * - ext4_da_writepages after taking page lock (have journal handle) * - journal_submit_inode_data_buffers (no journal handle) * - shrink_page_list via pdflush (no journal handle) * - grab_page_cache when doing write_begin (have journal handle) + * + * We don't do any block allocation in this function. If we have page with + * multiple blocks we need to write those buffer_heads that are mapped. This + * is important for mmaped based write. So if we do with blocksize 1K + * truncate(f, 1024); + * a = mmap(f, 0, 4096); + * a[0] = 'a'; + * truncate(f, 4096); + * we have in the page first buffer_head mapped via page_mkwrite call back + * but other bufer_heads would be unmapped but dirty(dirty done via the + * do_wp_page). So writepage should write the first block. If we modify + * the mmap area beyond 1024 we will again get a page_fault and the + * page_mkwrite callback will do the block allocation and mark the + * buffer_heads mapped. + * + * We redirty the page if we have any buffer_heads that is either delay or + * unwritten in the page. + * + * We can get recursively called as show below. + * + * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext4_writepage() + * + * But since we don't do any block allocation we should not deadlock. + * Page also have the dirty flag cleared so we don't get recurive page_lock. */ -static int ext4_da_writepage(struct page *page, - struct writeback_control *wbc) +static int ext4_writepage(struct page *page, + struct writeback_control *wbc) { int ret = 0; loff_t size; @@ -2541,7 +2619,7 @@ static int ext4_da_writepage(struct page *page, struct buffer_head *page_bufs; struct inode *inode = page->mapping->host; - trace_ext4_da_writepage(inode, page); + trace_ext4_writepage(inode, page); size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2551,7 +2629,7 @@ static int ext4_da_writepage(struct page *page, if (page_has_buffers(page)) { page_bufs = page_buffers(page); if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_unmapped_or_delay)) { + ext4_bh_delay_or_unwritten)) { /* * We don't want to do block allocation * So redirty the page and return @@ -2578,13 +2656,13 @@ static int ext4_da_writepage(struct page *page, * all are mapped and non delay. We don't want to * do block allocation here. */ - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ret = block_prepare_write(page, 0, len, noalloc_get_block_write); if (!ret) { page_bufs = page_buffers(page); /* check whether all are mapped and non delay */ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_unmapped_or_delay)) { + ext4_bh_delay_or_unwritten)) { redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; @@ -2600,7 +2678,16 @@ static int ext4_da_writepage(struct page *page, return 0; } /* now mark the buffer_heads as dirty and uptodate */ - block_commit_write(page, 0, PAGE_CACHE_SIZE); + block_commit_write(page, 0, len); + } + + if (PageChecked(page) && ext4_should_journal_data(inode)) { + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + ClearPageChecked(page); + return __ext4_journalled_writepage(page, wbc, len); } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) @@ -2907,7 +2994,7 @@ retry: * i_size_read because we hold i_mutex. */ if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext4_truncate(inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -3130,222 +3217,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, ext4_get_block); } -static int bget_one(handle_t *handle, struct buffer_head *bh) -{ - get_bh(bh); - return 0; -} - -static int bput_one(handle_t *handle, struct buffer_head *bh) -{ - put_bh(bh); - return 0; -} - -/* - * Note that we don't need to start a transaction unless we're journaling data - * because we should have holes filled from ext4_page_mkwrite(). We even don't - * need to file the inode to the transaction's list in ordered mode because if - * we are writing back data added by write(), the inode is already there and if - * we are writing back data modified via mmap(), noone guarantees in which - * transaction the data will hit the disk. In case we are journaling data, we - * cannot start transaction directly because transaction start ranks above page - * lock so we have to do some magic. - * - * In all journaling modes block_write_full_page() will start the I/O. - * - * Problem: - * - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext4_writepage() - * - * Similar for: - * - * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... - * - * Same applies to ext4_get_block(). We will deadlock on various things like - * lock_journal and i_data_sem - * - * Setting PF_MEMALLOC here doesn't work - too many internal memory - * allocations fail. - * - * 16May01: If we're reentered then journal_current_handle() will be - * non-zero. We simply *return*. - * - * 1 July 2001: @@@ FIXME: - * In journalled data mode, a data buffer may be metadata against the - * current transaction. But the same file is part of a shared mapping - * and someone does a writepage() on it. - * - * We will move the buffer onto the async_data list, but *after* it has - * been dirtied. So there's a small window where we have dirty data on - * BJ_Metadata. - * - * Note that this only applies to the last partial page in the file. The - * bit which block_write_full_page() uses prepare/commit for. (That's - * broken code anyway: it's wrong for msync()). - * - * It's a rare case: affects the final partial page, for journalled data - * where the file is subject to bith write() and writepage() in the same - * transction. To fix it we'll need a custom block_write_full_page(). - * We'll probably need that anyway for journalling writepage() output. - * - * We don't honour synchronous mounts for writepage(). That would be - * disastrous. Any write() or metadata operation will sync the fs for - * us. - * - */ -static int __ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - - if (test_opt(inode->i_sb, NOBH)) - return nobh_writepage(page, noalloc_get_block_write, wbc); - else - return block_write_full_page(page, noalloc_get_block_write, - wbc); -} - -static int ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t size = i_size_read(inode); - loff_t len; - - trace_ext4_normal_writepage(inode, page); - J_ASSERT(PageLocked(page)); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - if (page_has_buffers(page)) { - /* if page has buffers it should all be mapped - * and allocated. If there are not buffers attached - * to the page we know the page is dirty but it lost - * buffers. That means that at some moment in time - * after write_begin() / write_end() has been called - * all buffers have been clean and thus they must have been - * written at least once. So they are all mapped and we can - * happily proceed with mapping them and writing the page. - */ - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped_or_delay)); - } - - if (!ext4_journal_current_handle()) - return __ext4_normal_writepage(page, wbc); - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} - -static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct buffer_head *page_bufs; - handle_t *handle = NULL; - int ret = 0; - int err; - - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - noalloc_get_block_write); - if (ret != 0) - goto out_unlock; - - page_bufs = page_buffers(page); - walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, - bget_one); - /* As soon as we unlock the page, it can go away, but we have - * references to buffers so we are safe */ - unlock_page(page); - - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - ret = walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); - - err = walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, write_end_fn); - if (ret == 0) - ret = err; - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bput_one); - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - goto out; - -out_unlock: - unlock_page(page); -out: - return ret; -} - -static int ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t size = i_size_read(inode); - loff_t len; - - trace_ext4_journalled_writepage(inode, page); - J_ASSERT(PageLocked(page)); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - if (page_has_buffers(page)) { - /* if page has buffers it should all be mapped - * and allocated. If there are not buffers attached - * to the page we know the page is dirty but it lost - * buffers. That means that at some moment in time - * after write_begin() / write_end() has been called - * all buffers have been clean and thus they must have been - * written at least once. So they are all mapped and we can - * happily proceed with mapping them and writing the page. - */ - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped_or_delay)); - } - - if (ext4_journal_current_handle()) - goto no_write; - - if (PageChecked(page)) { - /* - * It's mmapped pagecache. Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. - */ - ClearPageChecked(page); - return __ext4_journalled_writepage(page, wbc); - } else { - /* - * It may be a page full of checkpoint-mode buffers. We don't - * really know unless we go poke around in the buffer_heads. - * But block_write_full_page will do the right thing. - */ - return block_write_full_page(page, noalloc_get_block_write, - wbc); - } -no_write: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} - static int ext4_readpage(struct file *file, struct page *page) { return mpage_readpage(page, ext4_get_block); @@ -3492,7 +3363,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) static const struct address_space_operations ext4_ordered_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_ordered_write_end, @@ -3507,7 +3378,7 @@ static const struct address_space_operations ext4_ordered_aops = { static const struct address_space_operations ext4_writeback_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_writeback_write_end, @@ -3522,7 +3393,7 @@ static const struct address_space_operations ext4_writeback_aops = { static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_journalled_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, @@ -3536,7 +3407,7 @@ static const struct address_space_operations ext4_journalled_aops = { static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_da_writepage, + .writepage = ext4_writepage, .writepages = ext4_da_writepages, .sync_page = block_sync_page, .write_begin = ext4_da_write_begin, @@ -3583,7 +3454,8 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page; int err = 0; - page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + mapping_gfp_mask(mapping) & ~__GFP_FS); if (!page) return -EINVAL; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bb415408fdb6..7050a9cd04a4 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -12,7 +12,6 @@ #include <linux/capability.h> #include <linux/time.h> #include <linux/compat.h> -#include <linux/smp_lock.h> #include <linux/mount.h> #include <linux/file.h> #include <asm/uaccess.h> @@ -192,7 +191,7 @@ setversion_out: case EXT4_IOC_GROUP_EXTEND: { ext4_fsblk_t n_blocks_count; struct super_block *sb = inode->i_sb; - int err, err2; + int err, err2=0; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -205,9 +204,11 @@ setversion_out: return err; err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } if (err == 0) err = err2; mnt_drop_write(filp->f_path.mnt); @@ -252,7 +253,7 @@ setversion_out: case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; struct super_block *sb = inode->i_sb; - int err, err2; + int err, err2=0; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -266,9 +267,11 @@ setversion_out: return err; err = ext4_group_add(sb, &input); - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } if (err == 0) err = err2; mnt_drop_write(filp->f_path.mnt); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 519a0a686d94..cd258463e2a9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -657,7 +657,8 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } -static void ext4_mb_generate_buddy(struct super_block *sb, +static noinline_for_stack +void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); @@ -1480,7 +1481,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, ext4_mb_check_limits(ac, e4b, 0); } -static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, +static noinline_for_stack +int ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_free_extent ex = ac->ac_b_ex; @@ -1507,7 +1509,8 @@ static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, return 0; } -static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, +static noinline_for_stack +int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { ext4_group_t group = ac->ac_g_ex.fe_group; @@ -1566,7 +1569,8 @@ static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, * The routine scans buddy structures (not bitmap!) from given order * to max order and tries to find big enough chunk to satisfy the req */ -static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, +static noinline_for_stack +void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; @@ -1609,7 +1613,8 @@ static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, * In order to optimize scanning, caller must pass number of * free blocks in the group, so the routine can know upper limit. */ -static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, +static noinline_for_stack +void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; @@ -1668,7 +1673,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * we try to find stripe-aligned chunks for stripe-size requests * XXX should do so at least for multiples of stripe size as well */ -static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, +static noinline_for_stack +void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; @@ -1831,7 +1837,8 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb, } -static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +static noinline_for_stack +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) { int ret; @@ -2902,7 +2909,11 @@ int __init init_ext4_mballoc(void) void exit_ext4_mballoc(void) { - /* XXX: synchronize_rcu(); */ + /* + * Wait for completion of call_rcu()'s on ext4_pspace_cachep + * before destroying the slab cache. + */ + rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_ext_cachep); @@ -3457,7 +3468,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, * used in in-core bitmap. buddy must be generated from this bitmap * Need to be called with ext4 group lock held */ -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, +static noinline_for_stack +void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); @@ -4215,14 +4227,9 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ + memset(ac, 0, sizeof(struct ext4_allocation_context)); ac->ac_b_ex.fe_logical = ar->logical; - ac->ac_b_ex.fe_group = 0; - ac->ac_b_ex.fe_start = 0; - ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; - ac->ac_groups_scanned = 0; - ac->ac_ex_scanned = 0; - ac->ac_found = 0; ac->ac_sb = sb; ac->ac_inode = ar->inode; ac->ac_o_ex.fe_logical = ar->logical; @@ -4233,15 +4240,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ac->ac_g_ex.fe_group = group; ac->ac_g_ex.fe_start = block; ac->ac_g_ex.fe_len = len; - ac->ac_f_ex.fe_len = 0; ac->ac_flags = ar->flags; - ac->ac_2order = 0; - ac->ac_criteria = 0; - ac->ac_pa = NULL; - ac->ac_bitmap_page = NULL; - ac->ac_buddy_page = NULL; - ac->alloc_semp = NULL; - ac->ac_lg = NULL; /* we have to define context: we'll we work with a file or * locality group. this is a policy, actually */ @@ -4509,10 +4508,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = ar->inode; - } else { + if (!ac) { ar->len = 0; *errp = -ENOMEM; goto out1; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index de04013d16ff..114abe5d2c1d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2536,7 +2536,7 @@ const struct inode_operations ext4_dir_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, .fiemap = ext4_fiemap, }; @@ -2548,5 +2548,5 @@ const struct inode_operations ext4_special_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, }; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 38ff75a0fe22..530b4ca01510 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -16,7 +16,6 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/time.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/compat.h> #include <asm/uaccess.h> diff --git a/fs/fat/file.c b/fs/fat/file.c index b28ea646ff60..e8c159de236b 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -134,7 +134,7 @@ static int fat_file_release(struct inode *inode, struct file *filp) if ((filp->f_mode & FMODE_WRITE) && MSDOS_SB(inode->i_sb)->options.flush) { fat_flush_inodes(inode->i_sb, inode, NULL); - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } return 0; } @@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size) inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - if (IS_SYNC(inode)) - err = sync_page_range_nolock(inode, mapping, start, count); + if (IS_SYNC(inode)) { + int err2; + + /* + * Opencode syncing since we don't have a file open to use + * standard fsync path. + */ + err = filemap_fdatawrite_range(mapping, start, + start + count - 1); + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + err2 = write_inode_now(inode, 1); + if (!err) + err = err2; + if (!err) { + err = filemap_fdatawait_range(mapping, start, + start + count - 1); + } + } out: return err; } diff --git a/fs/fat/misc.c b/fs/fat/misc.c index a6c20473dfd7..4e35be873e09 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) MSDOS_I(inode)->i_start = new_dclus; MSDOS_I(inode)->i_logstart = new_dclus; /* - * Since generic_osync_inode() synchronize later if - * this is not directory, we don't here. + * Since generic_write_sync() synchronizes regular files later, + * we sync here only directories. */ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { ret = fat_sync_inode(inode); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 82f88733b681..bbc94ae4fd77 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -9,7 +9,6 @@ #include <linux/module.h> #include <linux/time.h> #include <linux/buffer_head.h> -#include <linux/smp_lock.h> #include "fat.h" /* Characters that are undesirable in an MS-DOS file name */ diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 73471b7ecc8c..cb6e83557112 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -19,7 +19,6 @@ #include <linux/jiffies.h> #include <linux/ctype.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/namei.h> #include "fat.h" diff --git a/fs/fcntl.c b/fs/fcntl.c index a040b764f8e3..ae413086db97 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -19,7 +19,6 @@ #include <linux/signal.h> #include <linux/rcupdate.h> #include <linux/pid_namespace.h> -#include <linux/smp_lock.h> #include <asm/poll.h> #include <asm/siginfo.h> diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index cdbd1654e4cd..1e8af939b3e4 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -38,6 +38,7 @@ #include <linux/buffer_head.h> #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/smp_lock.h> #include <linux/stat.h> #include <linux/vfs.h> #include <linux/mount.h> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c54226be5294..8e1e5e19d21e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -19,171 +19,245 @@ #include <linux/sched.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> #include "internal.h" +#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) -/** - * writeback_acquire - attempt to get exclusive writeback access to a device - * @bdi: the device's backing_dev_info structure - * - * It is a waste of resources to have more than one pdflush thread blocked on - * a single request queue. Exclusion at the request_queue level is obtained - * via a flag in the request_queue's backing_dev_info.state. - * - * Non-request_queue-backed address_spaces will share default_backing_dev_info, - * unless they implement their own. Which is somewhat inefficient, as this - * may prevent concurrent writeback against multiple devices. +/* + * We don't actually have pdflush, but this one is exported though /proc... + */ +int nr_pdflush_threads; + +/* + * Passed into wb_writeback(), essentially a subset of writeback_control + */ +struct wb_writeback_args { + long nr_pages; + struct super_block *sb; + enum writeback_sync_modes sync_mode; + int for_kupdate; + int range_cyclic; +}; + +/* + * Work items for the bdi_writeback threads */ -static int writeback_acquire(struct backing_dev_info *bdi) +struct bdi_work { + struct list_head list; /* pending work list */ + struct rcu_head rcu_head; /* for RCU free/clear of work */ + + unsigned long seen; /* threads that have seen this work */ + atomic_t pending; /* number of threads still to do work */ + + struct wb_writeback_args args; /* writeback arguments */ + + unsigned long state; /* flag bits, see WS_* */ +}; + +enum { + WS_USED_B = 0, + WS_ONSTACK_B, +}; + +#define WS_USED (1 << WS_USED_B) +#define WS_ONSTACK (1 << WS_ONSTACK_B) + +static inline bool bdi_work_on_stack(struct bdi_work *work) +{ + return test_bit(WS_ONSTACK_B, &work->state); +} + +static inline void bdi_work_init(struct bdi_work *work, + struct wb_writeback_args *args) { - return !test_and_set_bit(BDI_pdflush, &bdi->state); + INIT_RCU_HEAD(&work->rcu_head); + work->args = *args; + work->state = WS_USED; } /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. * - * Determine whether there is writeback in progress against a backing device. + * Determine whether there is writeback waiting to be handled against a + * backing device. */ int writeback_in_progress(struct backing_dev_info *bdi) { - return test_bit(BDI_pdflush, &bdi->state); + return !list_empty(&bdi->work_list); } -/** - * writeback_release - relinquish exclusive writeback access against a device. - * @bdi: the device's backing_dev_info structure - */ -static void writeback_release(struct backing_dev_info *bdi) +static void bdi_work_clear(struct bdi_work *work) { - BUG_ON(!writeback_in_progress(bdi)); - clear_bit(BDI_pdflush, &bdi->state); + clear_bit(WS_USED_B, &work->state); + smp_mb__after_clear_bit(); + /* + * work can have disappeared at this point. bit waitq functions + * should be able to tolerate this, provided bdi_sched_wait does + * not dereference it's pointer argument. + */ + wake_up_bit(&work->state, WS_USED_B); } -static noinline void block_dump___mark_inode_dirty(struct inode *inode) +static void bdi_work_free(struct rcu_head *head) { - if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { - struct dentry *dentry; - const char *name = "?"; + struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); - dentry = d_find_alias(inode); - if (dentry) { - spin_lock(&dentry->d_lock); - name = (const char *) dentry->d_name.name; - } - printk(KERN_DEBUG - "%s(%d): dirtied inode %lu (%s) on %s\n", - current->comm, task_pid_nr(current), inode->i_ino, - name, inode->i_sb->s_id); - if (dentry) { - spin_unlock(&dentry->d_lock); - dput(dentry); - } - } + if (!bdi_work_on_stack(work)) + kfree(work); + else + bdi_work_clear(work); } -/** - * __mark_inode_dirty - internal function - * @inode: inode to mark - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) - * Mark an inode as dirty. Callers should use mark_inode_dirty or - * mark_inode_dirty_sync. - * - * Put the inode on the super block's dirty list. - * - * CAREFUL! We mark it dirty unconditionally, but move it onto the - * dirty list only if it is hashed or if it refers to a blockdev. - * If it was not hashed, it will never be added to the dirty list - * even if it is later hashed, as it will have been marked dirty already. - * - * In short, make sure you hash any inodes _before_ you start marking - * them dirty. - * - * This function *must* be atomic for the I_DIRTY_PAGES case - - * set_page_dirty() is called under spinlock in several places. - * - * Note that for blockdevs, inode->dirtied_when represents the dirtying time of - * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of - * the kernel-internal blockdev inode represents the dirtying time of the - * blockdev's pages. This is why for I_DIRTY_PAGES we always use - * page->mapping->host, so the page-dirtying time is recorded in the internal - * blockdev inode. - */ -void __mark_inode_dirty(struct inode *inode, int flags) +static void wb_work_complete(struct bdi_work *work) { - struct super_block *sb = inode->i_sb; + const enum writeback_sync_modes sync_mode = work->args.sync_mode; + int onstack = bdi_work_on_stack(work); /* - * Don't do this for I_DIRTY_PAGES - that doesn't actually - * dirty the inode itself + * For allocated work, we can clear the done/seen bit right here. + * For on-stack work, we need to postpone both the clear and free + * to after the RCU grace period, since the stack could be invalidated + * as soon as bdi_work_clear() has done the wakeup. */ - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { - if (sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode); - } + if (!onstack) + bdi_work_clear(work); + if (sync_mode == WB_SYNC_NONE || onstack) + call_rcu(&work->rcu_head, bdi_work_free); +} +static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) +{ /* - * make sure that changes are seen by all cpus before we test i_state - * -- mikulas + * The caller has retrieved the work arguments from this work, + * drop our reference. If this is the last ref, delete and free it */ - smp_mb(); + if (atomic_dec_and_test(&work->pending)) { + struct backing_dev_info *bdi = wb->bdi; - /* avoid the locking if we can */ - if ((inode->i_state & flags) == flags) - return; + spin_lock(&bdi->wb_lock); + list_del_rcu(&work->list); + spin_unlock(&bdi->wb_lock); - if (unlikely(block_dump)) - block_dump___mark_inode_dirty(inode); + wb_work_complete(work); + } +} - spin_lock(&inode_lock); - if ((inode->i_state & flags) != flags) { - const int was_dirty = inode->i_state & I_DIRTY; +static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) +{ + work->seen = bdi->wb_mask; + BUG_ON(!work->seen); + atomic_set(&work->pending, bdi->wb_cnt); + BUG_ON(!bdi->wb_cnt); - inode->i_state |= flags; + /* + * list_add_tail_rcu() contains the necessary barriers to + * make sure the above stores are seen before the item is + * noticed on the list + */ + spin_lock(&bdi->wb_lock); + list_add_tail_rcu(&work->list, &bdi->work_list); + spin_unlock(&bdi->wb_lock); - /* - * If the inode is being synced, just update its dirty state. - * The unlocker will place the inode on the appropriate - * superblock list, based upon its state. - */ - if (inode->i_state & I_SYNC) - goto out; + /* + * If the default thread isn't there, make sure we add it. When + * it gets created and wakes up, we'll run this work. + */ + if (unlikely(list_empty_careful(&bdi->wb_list))) + wake_up_process(default_backing_dev_info.wb.task); + else { + struct bdi_writeback *wb = &bdi->wb; - /* - * Only add valid (hashed) inodes to the superblock's - * dirty list. Add blockdev inodes as well. - */ - if (!S_ISBLK(inode->i_mode)) { - if (hlist_unhashed(&inode->i_hash)) - goto out; - } - if (inode->i_state & (I_FREEING|I_CLEAR)) - goto out; + if (wb->task) + wake_up_process(wb->task); + } +} - /* - * If the inode was already on s_dirty/s_io/s_more_io, don't - * reposition it (that would break s_dirty time-ordering). - */ - if (!was_dirty) { - inode->dirtied_when = jiffies; - list_move(&inode->i_list, &sb->s_dirty); - } +/* + * Used for on-stack allocated work items. The caller needs to wait until + * the wb threads have acked the work before it's safe to continue. + */ +static void bdi_wait_on_work_clear(struct bdi_work *work) +{ + wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); +} + +static void bdi_alloc_queue_work(struct backing_dev_info *bdi, + struct wb_writeback_args *args) +{ + struct bdi_work *work; + + /* + * This is WB_SYNC_NONE writeback, so if allocation fails just + * wakeup the thread for old dirty data writeback + */ + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + bdi_work_init(work, args); + bdi_queue_work(bdi, work); + } else { + struct bdi_writeback *wb = &bdi->wb; + + if (wb->task) + wake_up_process(wb->task); } -out: - spin_unlock(&inode_lock); } -EXPORT_SYMBOL(__mark_inode_dirty); +/** + * bdi_sync_writeback - start and wait for writeback + * @bdi: the backing device to write from + * @sb: write inodes from this super_block + * + * Description: + * This does WB_SYNC_ALL data integrity writeback and waits for the + * IO to complete. Callers must hold the sb s_umount semaphore for + * reading, to avoid having the super disappear before we are done. + */ +static void bdi_sync_writeback(struct backing_dev_info *bdi, + struct super_block *sb) +{ + struct wb_writeback_args args = { + .sb = sb, + .sync_mode = WB_SYNC_ALL, + .nr_pages = LONG_MAX, + .range_cyclic = 0, + }; + struct bdi_work work; -static int write_inode(struct inode *inode, int sync) + bdi_work_init(&work, &args); + work.state |= WS_ONSTACK; + + bdi_queue_work(bdi, &work); + bdi_wait_on_work_clear(&work); +} + +/** + * bdi_start_writeback - start writeback + * @bdi: the backing device to write from + * @nr_pages: the number of pages to write + * + * Description: + * This does WB_SYNC_NONE opportunistic writeback. The IO is only + * started when this function returns, we make no guarentees on + * completion. Caller need not hold sb s_umount semaphore. + * + */ +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) { - if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) - return inode->i_sb->s_op->write_inode(inode, sync); - return 0; + struct wb_writeback_args args = { + .sync_mode = WB_SYNC_NONE, + .nr_pages = nr_pages, + .range_cyclic = 1, + }; + + bdi_alloc_queue_work(bdi, &args); } /* @@ -191,31 +265,32 @@ static int write_inode(struct inode *inode, int sync) * furthest end of its superblock's dirty-inode list. * * Before stamping the inode's ->dirtied_when, we check to see whether it is - * already the most-recently-dirtied inode on the s_dirty list. If that is + * already the most-recently-dirtied inode on the b_dirty list. If that is * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ static void redirty_tail(struct inode *inode) { - struct super_block *sb = inode->i_sb; + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - if (!list_empty(&sb->s_dirty)) { - struct inode *tail_inode; + if (!list_empty(&wb->b_dirty)) { + struct inode *tail; - tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); - if (time_before(inode->dirtied_when, - tail_inode->dirtied_when)) + tail = list_entry(wb->b_dirty.next, struct inode, i_list); + if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - list_move(&inode->i_list, &sb->s_dirty); + list_move(&inode->i_list, &wb->b_dirty); } /* - * requeue inode for re-scanning after sb->s_io list is exhausted. + * requeue inode for re-scanning after bdi->b_io list is exhausted. */ static void requeue_io(struct inode *inode) { - list_move(&inode->i_list, &inode->i_sb->s_more_io); + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + + list_move(&inode->i_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -262,20 +337,18 @@ static void move_expired_inodes(struct list_head *delaying_queue, /* * Queue all expired dirty inodes for io, eldest first. */ -static void queue_io(struct super_block *sb, - unsigned long *older_than_this) +static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { - list_splice_init(&sb->s_more_io, sb->s_io.prev); - move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); + list_splice_init(&wb->b_more_io, wb->b_io.prev); + move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } -int sb_has_dirty_inodes(struct super_block *sb) +static int write_inode(struct inode *inode, int sync) { - return !list_empty(&sb->s_dirty) || - !list_empty(&sb->s_io) || - !list_empty(&sb->s_more_io); + if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) + return inode->i_sb->s_op->write_inode(inode, sync); + return 0; } -EXPORT_SYMBOL(sb_has_dirty_inodes); /* * Wait for writeback on an inode to complete. @@ -322,11 +395,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (inode->i_state & I_SYNC) { /* * If this inode is locked for writeback and we are not doing - * writeback-for-data-integrity, move it to s_more_io so that + * writeback-for-data-integrity, move it to b_more_io so that * writeback can proceed with the other inodes on s_io. * * We'll have another go at writing back this inode when we - * completed a full scan of s_io. + * completed a full scan of b_io. */ if (!wait) { requeue_io(inode); @@ -371,11 +444,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* * We didn't write back all the pages. nfs_writepages() * sometimes bales out without doing anything. Redirty - * the inode; Move it from s_io onto s_more_io/s_dirty. + * the inode; Move it from b_io onto b_more_io/b_dirty. */ /* * akpm: if the caller was the kupdate function we put - * this inode at the head of s_dirty so it gets first + * this inode at the head of b_dirty so it gets first * consideration. Otherwise, move it to the tail, for * the reasons described there. I'm not really sure * how much sense this makes. Presumably I had a good @@ -385,7 +458,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->for_kupdate) { /* * For the kupdate function we move the inode - * to s_more_io so it will get more writeout as + * to b_more_io so it will get more writeout as * soon as the queue becomes uncongested. */ inode->i_state |= I_DIRTY_PAGES; @@ -434,50 +507,84 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } /* - * Write out a superblock's list of dirty inodes. A wait will be performed - * upon no inodes, all inodes or the final one, depending upon sync_mode. - * - * If older_than_this is non-NULL, then only write out inodes which - * had their first dirtying at a time earlier than *older_than_this. - * - * If we're a pdflush thread, then implement pdflush collision avoidance - * against the entire list. + * For WB_SYNC_NONE writeback, the caller does not have the sb pinned + * before calling writeback. So make sure that we do pin it, so it doesn't + * go away while we are writing inodes from it. * - * If `bdi' is non-zero then we're being asked to writeback a specific queue. - * This function assumes that the blockdev superblock's inodes are backed by - * a variety of queues, so all inodes are searched. For other superblocks, - * assume that all inodes are backed by the same queue. - * - * FIXME: this linear search could get expensive with many fileystems. But - * how to fix? We need to go from an address_space to all inodes which share - * a queue with that address_space. (Easy: have a global "dirty superblocks" - * list). - * - * The inodes to be written are parked on sb->s_io. They are moved back onto - * sb->s_dirty as they are selected for writing. This way, none can be missed - * on the writer throttling path, and we get decent balancing between many - * throttled threads: we don't want them all piling up on inode_sync_wait. + * Returns 0 if the super was successfully pinned (or pinning wasn't needed), + * 1 if we failed. */ -void generic_sync_sb_inodes(struct super_block *sb, +static int pin_sb_for_writeback(struct writeback_control *wbc, + struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + /* + * Caller must already hold the ref for this + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + return 0; + } + + spin_lock(&sb_lock); + sb->s_count++; + if (down_read_trylock(&sb->s_umount)) { + if (sb->s_root) { + spin_unlock(&sb_lock); + return 0; + } + /* + * umounted, drop rwsem again and fall through to failure + */ + up_read(&sb->s_umount); + } + + sb->s_count--; + spin_unlock(&sb_lock); + return 1; +} + +static void unpin_sb_for_writeback(struct writeback_control *wbc, + struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (wbc->sync_mode == WB_SYNC_ALL) + return; + + up_read(&sb->s_umount); + put_super(sb); +} + +static void writeback_inodes_wb(struct bdi_writeback *wb, struct writeback_control *wbc) { + struct super_block *sb = wbc->sb; + const int is_blkdev_sb = sb_is_blkdev_sb(sb); const unsigned long start = jiffies; /* livelock avoidance */ - int sync = wbc->sync_mode == WB_SYNC_ALL; spin_lock(&inode_lock); - if (!wbc->for_kupdate || list_empty(&sb->s_io)) - queue_io(sb, wbc->older_than_this); - while (!list_empty(&sb->s_io)) { - struct inode *inode = list_entry(sb->s_io.prev, + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + + while (!list_empty(&wb->b_io)) { + struct inode *inode = list_entry(wb->b_io.prev, struct inode, i_list); - struct address_space *mapping = inode->i_mapping; - struct backing_dev_info *bdi = mapping->backing_dev_info; long pages_skipped; - if (!bdi_cap_writeback_dirty(bdi)) { + /* + * super block given and doesn't match, skip this inode + */ + if (sb && sb != inode->i_sb) { + redirty_tail(inode); + continue; + } + + if (!bdi_cap_writeback_dirty(wb->bdi)) { redirty_tail(inode); - if (sb_is_blkdev_sb(sb)) { + if (is_blkdev_sb) { /* * Dirty memory-backed blockdev: the ramdisk * driver does this. Skip just this inode @@ -497,21 +604,14 @@ void generic_sync_sb_inodes(struct super_block *sb, continue; } - if (wbc->nonblocking && bdi_write_congested(bdi)) { + if (wbc->nonblocking && bdi_write_congested(wb->bdi)) { wbc->encountered_congestion = 1; - if (!sb_is_blkdev_sb(sb)) + if (!is_blkdev_sb) break; /* Skip a congested fs */ requeue_io(inode); continue; /* Skip a congested blockdev */ } - if (wbc->bdi && bdi != wbc->bdi) { - if (!sb_is_blkdev_sb(sb)) - break; /* fs has the wrong queue */ - requeue_io(inode); - continue; /* blockdev has wrong queue */ - } - /* * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. @@ -519,16 +619,16 @@ void generic_sync_sb_inodes(struct super_block *sb, if (inode_dirtied_after(inode, start)) break; - /* Is another pdflush already flushing this queue? */ - if (current_is_pdflush() && !writeback_acquire(bdi)) - break; + if (pin_sb_for_writeback(wbc, inode)) { + requeue_io(inode); + continue; + } BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); __iget(inode); pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); - if (current_is_pdflush()) - writeback_release(bdi); + unpin_sb_for_writeback(wbc, inode); if (wbc->pages_skipped != pages_skipped) { /* * writeback is not making progress due to locked @@ -544,144 +644,520 @@ void generic_sync_sb_inodes(struct super_block *sb, wbc->more_io = 1; break; } - if (!list_empty(&sb->s_more_io)) + if (!list_empty(&wb->b_more_io)) wbc->more_io = 1; } - if (sync) { - struct inode *inode, *old_inode = NULL; + spin_unlock(&inode_lock); + /* Leave any unwritten inodes on b_io */ +} + +void writeback_inodes_wbc(struct writeback_control *wbc) +{ + struct backing_dev_info *bdi = wbc->bdi; + writeback_inodes_wb(&bdi->wb, wbc); +} + +/* + * The maximum number of pages to writeout in a single bdi flush/kupdate + * operation. We do this so we don't hold I_SYNC against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024 + +static inline bool over_bground_thresh(void) +{ + unsigned long background_thresh, dirty_thresh; + + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + + return (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) >= background_thresh); +} + +/* + * Explicit flushing or periodic writeback of "old" data. + * + * Define "old": the first time one of an inode's pages is dirtied, we mark the + * dirtying-time in the inode's address_space. So this periodic writeback code + * just walks the superblock inode list, writing back any inodes which are + * older than a specific point in time. + * + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a + * one-second gap. + * + * older_than_this takes precedence over nr_to_write. So we'll only write back + * all dirty pages if they are all attached to "old" mappings. + */ +static long wb_writeback(struct bdi_writeback *wb, + struct wb_writeback_args *args) +{ + struct writeback_control wbc = { + .bdi = wb->bdi, + .sb = args->sb, + .sync_mode = args->sync_mode, + .older_than_this = NULL, + .for_kupdate = args->for_kupdate, + .range_cyclic = args->range_cyclic, + }; + unsigned long oldest_jif; + long wrote = 0; + + if (wbc.for_kupdate) { + wbc.older_than_this = &oldest_jif; + oldest_jif = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + } + if (!wbc.range_cyclic) { + wbc.range_start = 0; + wbc.range_end = LLONG_MAX; + } + + for (;;) { /* - * Data integrity sync. Must wait for all pages under writeback, - * because there may have been pages dirtied before our sync - * call, but which had writeout started before we write it out. - * In which case, the inode may not be on the dirty list, but - * we still have to wait for that writeout. + * Don't flush anything for non-integrity writeback where + * no nr_pages was given */ - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - struct address_space *mapping; + if (!args->for_kupdate && args->nr_pages <= 0 && + args->sync_mode == WB_SYNC_NONE) + break; - if (inode->i_state & - (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) - continue; - mapping = inode->i_mapping; - if (mapping->nrpages == 0) + /* + * If no specific pages were given and this is just a + * periodic background writeout and we are below the + * background dirty threshold, don't do anything + */ + if (args->for_kupdate && args->nr_pages <= 0 && + !over_bground_thresh()) + break; + + wbc.more_io = 0; + wbc.encountered_congestion = 0; + wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.pages_skipped = 0; + writeback_inodes_wb(wb, &wbc); + args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; + + /* + * If we ran out of stuff to write, bail unless more_io got set + */ + if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { + if (wbc.more_io && !wbc.for_kupdate) continue; - __iget(inode); - spin_unlock(&inode_lock); - /* - * We hold a reference to 'inode' so it couldn't have - * been removed from s_inodes list while we dropped the - * inode_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it - * under inode_lock. So we keep the reference and iput - * it later. - */ - iput(old_inode); - old_inode = inode; + break; + } + } + + return wrote; +} + +/* + * Return the next bdi_work struct that hasn't been processed by this + * wb thread yet. ->seen is initially set for each thread that exists + * for this device, when a thread first notices a piece of work it + * clears its bit. Depending on writeback type, the thread will notify + * completion on either receiving the work (WB_SYNC_NONE) or after + * it is done (WB_SYNC_ALL). + */ +static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, + struct bdi_writeback *wb) +{ + struct bdi_work *work, *ret = NULL; + + rcu_read_lock(); + + list_for_each_entry_rcu(work, &bdi->work_list, list) { + if (!test_bit(wb->nr, &work->seen)) + continue; + clear_bit(wb->nr, &work->seen); + + ret = work; + break; + } + + rcu_read_unlock(); + return ret; +} + +static long wb_check_old_data_flush(struct bdi_writeback *wb) +{ + unsigned long expired; + long nr_pages; + + expired = wb->last_old_flush + + msecs_to_jiffies(dirty_writeback_interval * 10); + if (time_before(jiffies, expired)) + return 0; + + wb->last_old_flush = jiffies; + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + + if (nr_pages) { + struct wb_writeback_args args = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .for_kupdate = 1, + .range_cyclic = 1, + }; + + return wb_writeback(wb, &args); + } + + return 0; +} + +/* + * Retrieve work items and do the writeback they describe + */ +long wb_do_writeback(struct bdi_writeback *wb, int force_wait) +{ + struct backing_dev_info *bdi = wb->bdi; + struct bdi_work *work; + long wrote = 0; - filemap_fdatawait(mapping); + while ((work = get_next_work_item(bdi, wb)) != NULL) { + struct wb_writeback_args args = work->args; - cond_resched(); + /* + * Override sync mode, in case we must wait for completion + */ + if (force_wait) + work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; - spin_lock(&inode_lock); + /* + * If this isn't a data integrity operation, just notify + * that we have seen this work and we are now starting it. + */ + if (args.sync_mode == WB_SYNC_NONE) + wb_clear_pending(wb, work); + + wrote += wb_writeback(wb, &args); + + /* + * This is a data integrity writeback, so only do the + * notification when we have completed the work. + */ + if (args.sync_mode == WB_SYNC_ALL) + wb_clear_pending(wb, work); + } + + /* + * Check for periodic writeback, kupdated() style + */ + wrote += wb_check_old_data_flush(wb); + + return wrote; +} + +/* + * Handle writeback of dirty data for the device backed by this bdi. Also + * wakes up periodically and does kupdated style flushing. + */ +int bdi_writeback_task(struct bdi_writeback *wb) +{ + unsigned long last_active = jiffies; + unsigned long wait_jiffies = -1UL; + long pages_written; + + while (!kthread_should_stop()) { + pages_written = wb_do_writeback(wb, 0); + + if (pages_written) + last_active = jiffies; + else if (wait_jiffies != -1UL) { + unsigned long max_idle; + + /* + * Longest period of inactivity that we tolerate. If we + * see dirty data again later, the task will get + * recreated automatically. + */ + max_idle = max(5UL * 60 * HZ, wait_jiffies); + if (time_after(jiffies, max_idle + last_active)) + break; } - spin_unlock(&inode_lock); - iput(old_inode); - } else - spin_unlock(&inode_lock); - return; /* Leave any unwritten inodes on s_io */ + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout_interruptible(wait_jiffies); + try_to_freeze(); + } + + return 0; } -EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); -static void sync_sb_inodes(struct super_block *sb, - struct writeback_control *wbc) +/* + * Schedule writeback for all backing devices. This does WB_SYNC_NONE + * writeback, for integrity writeback see bdi_sync_writeback(). + */ +static void bdi_writeback_all(struct super_block *sb, long nr_pages) { - generic_sync_sb_inodes(sb, wbc); + struct wb_writeback_args args = { + .sb = sb, + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + }; + struct backing_dev_info *bdi; + + rcu_read_lock(); + + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { + if (!bdi_has_dirty_io(bdi)) + continue; + + bdi_alloc_queue_work(bdi, &args); + } + + rcu_read_unlock(); } /* - * Start writeback of dirty pagecache data against all unlocked inodes. + * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back + * the whole world. + */ +void wakeup_flusher_threads(long nr_pages) +{ + if (nr_pages == 0) + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + bdi_writeback_all(NULL, nr_pages); +} + +static noinline void block_dump___mark_inode_dirty(struct inode *inode) +{ + if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { + struct dentry *dentry; + const char *name = "?"; + + dentry = d_find_alias(inode); + if (dentry) { + spin_lock(&dentry->d_lock); + name = (const char *) dentry->d_name.name; + } + printk(KERN_DEBUG + "%s(%d): dirtied inode %lu (%s) on %s\n", + current->comm, task_pid_nr(current), inode->i_ino, + name, inode->i_sb->s_id); + if (dentry) { + spin_unlock(&dentry->d_lock); + dput(dentry); + } + } +} + +/** + * __mark_inode_dirty - internal function + * @inode: inode to mark + * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * Mark an inode as dirty. Callers should use mark_inode_dirty or + * mark_inode_dirty_sync. + * + * Put the inode on the super block's dirty list. * - * Note: - * We don't need to grab a reference to superblock here. If it has non-empty - * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed - * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all - * empty. Since __sync_single_inode() regains inode_lock before it finally moves - * inode from superblock lists we are OK. + * CAREFUL! We mark it dirty unconditionally, but move it onto the + * dirty list only if it is hashed or if it refers to a blockdev. + * If it was not hashed, it will never be added to the dirty list + * even if it is later hashed, as it will have been marked dirty already. * - * If `older_than_this' is non-zero then only flush inodes which have a - * flushtime older than *older_than_this. + * In short, make sure you hash any inodes _before_ you start marking + * them dirty. * - * If `bdi' is non-zero then we will scan the first inode against each - * superblock until we find the matching ones. One group will be the dirty - * inodes against a filesystem. Then when we hit the dummy blockdev superblock, - * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not - * super-efficient but we're about to do a ton of I/O... + * This function *must* be atomic for the I_DIRTY_PAGES case - + * set_page_dirty() is called under spinlock in several places. + * + * Note that for blockdevs, inode->dirtied_when represents the dirtying time of + * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of + * the kernel-internal blockdev inode represents the dirtying time of the + * blockdev's pages. This is why for I_DIRTY_PAGES we always use + * page->mapping->host, so the page-dirtying time is recorded in the internal + * blockdev inode. */ -void -writeback_inodes(struct writeback_control *wbc) +void __mark_inode_dirty(struct inode *inode, int flags) { - struct super_block *sb; + struct super_block *sb = inode->i_sb; - might_sleep(); - spin_lock(&sb_lock); -restart: - list_for_each_entry_reverse(sb, &super_blocks, s_list) { - if (sb_has_dirty_inodes(sb)) { - /* we're making our own get_super here */ - sb->s_count++; - spin_unlock(&sb_lock); - /* - * If we can't get the readlock, there's no sense in - * waiting around, most of the time the FS is going to - * be unmounted by the time it is released. - */ - if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) - sync_sb_inodes(sb, wbc); - up_read(&sb->s_umount); + /* + * Don't do this for I_DIRTY_PAGES - that doesn't actually + * dirty the inode itself + */ + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (sb->s_op->dirty_inode) + sb->s_op->dirty_inode(inode); + } + + /* + * make sure that changes are seen by all cpus before we test i_state + * -- mikulas + */ + smp_mb(); + + /* avoid the locking if we can */ + if ((inode->i_state & flags) == flags) + return; + + if (unlikely(block_dump)) + block_dump___mark_inode_dirty(inode); + + spin_lock(&inode_lock); + if ((inode->i_state & flags) != flags) { + const int was_dirty = inode->i_state & I_DIRTY; + + inode->i_state |= flags; + + /* + * If the inode is being synced, just update its dirty state. + * The unlocker will place the inode on the appropriate + * superblock list, based upon its state. + */ + if (inode->i_state & I_SYNC) + goto out; + + /* + * Only add valid (hashed) inodes to the superblock's + * dirty list. Add blockdev inodes as well. + */ + if (!S_ISBLK(inode->i_mode)) { + if (hlist_unhashed(&inode->i_hash)) + goto out; + } + if (inode->i_state & (I_FREEING|I_CLEAR)) + goto out; + + /* + * If the inode was already on b_dirty/b_io/b_more_io, don't + * reposition it (that would break b_dirty time-ordering). + */ + if (!was_dirty) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + struct backing_dev_info *bdi = wb->bdi; + + if (bdi_cap_writeback_dirty(bdi) && + !test_bit(BDI_registered, &bdi->state)) { + WARN_ON(1); + printk(KERN_ERR "bdi-%s not registered\n", + bdi->name); } - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; + + inode->dirtied_when = jiffies; + list_move(&inode->i_list, &wb->b_dirty); } - if (wbc->nr_to_write <= 0) - break; } - spin_unlock(&sb_lock); +out: + spin_unlock(&inode_lock); } +EXPORT_SYMBOL(__mark_inode_dirty); /* - * writeback and wait upon the filesystem's dirty inodes. The caller will - * do this in two passes - one to write, and one to wait. + * Write out a superblock's list of dirty inodes. A wait will be performed + * upon no inodes, all inodes or the final one, depending upon sync_mode. + * + * If older_than_this is non-NULL, then only write out inodes which + * had their first dirtying at a time earlier than *older_than_this. + * + * If we're a pdlfush thread, then implement pdflush collision avoidance + * against the entire list. * - * A finite limit is set on the number of pages which will be written. - * To prevent infinite livelock of sys_sync(). + * If `bdi' is non-zero then we're being asked to writeback a specific queue. + * This function assumes that the blockdev superblock's inodes are backed by + * a variety of queues, so all inodes are searched. For other superblocks, + * assume that all inodes are backed by the same queue. * - * We add in the number of potentially dirty inodes, because each inode write - * can dirty pagecache in the underlying blockdev. + * The inodes to be written are parked on bdi->b_io. They are moved back onto + * bdi->b_dirty as they are selected for writing. This way, none can be missed + * on the writer throttling path, and we get decent balancing between many + * throttled threads: we don't want them all piling up on inode_sync_wait. */ -void sync_inodes_sb(struct super_block *sb, int wait) +static void wait_sb_inodes(struct super_block *sb) { - struct writeback_control wbc = { - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; + struct inode *inode, *old_inode = NULL; + + /* + * We need to be protected against the filesystem going from + * r/o to r/w or vice versa. + */ + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + spin_lock(&inode_lock); + + /* + * Data integrity sync. Must wait for all pages under writeback, + * because there may have been pages dirtied before our sync + * call, but which had writeout started before we write it out. + * In which case, the inode may not be on the dirty list, but + * we still have to wait for that writeout. + */ + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + struct address_space *mapping; + + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) + continue; + mapping = inode->i_mapping; + if (mapping->nrpages == 0) + continue; + __iget(inode); + spin_unlock(&inode_lock); + /* + * We hold a reference to 'inode' so it couldn't have + * been removed from s_inodes list while we dropped the + * inode_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it + * under inode_lock. So we keep the reference and iput + * it later. + */ + iput(old_inode); + old_inode = inode; + + filemap_fdatawait(mapping); + + cond_resched(); - if (!wait) { - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + spin_lock(&inode_lock); + } + spin_unlock(&inode_lock); + iput(old_inode); +} - wbc.nr_to_write = nr_dirty + nr_unstable + +/** + * writeback_inodes_sb - writeback dirty inodes from given super_block + * @sb: the superblock + * + * Start writeback on some inodes on this super_block. No guarantees are made + * on how many (if any) will be written, and this function does not wait + * for IO completion of submitted IO. The number of pages submitted is + * returned. + */ +void writeback_inodes_sb(struct super_block *sb) +{ + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + long nr_to_write; + + nr_to_write = nr_dirty + nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); - } else - wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ - sync_sb_inodes(sb, &wbc); + bdi_writeback_all(sb, nr_to_write); +} +EXPORT_SYMBOL(writeback_inodes_sb); + +/** + * sync_inodes_sb - sync sb inode pages + * @sb: the superblock + * + * This function writes and waits on any dirty inode belonging to this + * super_block. The number of pages synced is returned. + */ +void sync_inodes_sb(struct super_block *sb) +{ + bdi_sync_writeback(sb->s_bdi, sb); + wait_sb_inodes(sb); } +EXPORT_SYMBOL(sync_inodes_sb); /** * write_inode_now - write an inode to disk @@ -737,57 +1213,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) return ret; } EXPORT_SYMBOL(sync_inode); - -/** - * generic_osync_inode - flush all dirty data for a given inode to disk - * @inode: inode to write - * @mapping: the address_space that should be flushed - * @what: what to write and wait upon - * - * This can be called by file_write functions for files which have the - * O_SYNC flag set, to flush dirty writes to disk. - * - * @what is a bitmask, specifying which part of the inode's data should be - * written and waited upon. - * - * OSYNC_DATA: i_mapping's dirty data - * OSYNC_METADATA: the buffers at i_mapping->private_list - * OSYNC_INODE: the inode itself - */ - -int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what) -{ - int err = 0; - int need_write_inode_now = 0; - int err2; - - if (what & OSYNC_DATA) - err = filemap_fdatawrite(mapping); - if (what & (OSYNC_METADATA|OSYNC_DATA)) { - err2 = sync_mapping_buffers(mapping); - if (!err) - err = err2; - } - if (what & OSYNC_DATA) { - err2 = filemap_fdatawait(mapping); - if (!err) - err = err2; - } - - spin_lock(&inode_lock); - if ((inode->i_state & I_DIRTY) && - ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC))) - need_write_inode_now = 1; - spin_unlock(&inode_lock); - - if (need_write_inode_now) { - err2 = write_inode_now(inode, 1); - if (!err) - err = err2; - } - else - inode_sync_wait(inode); - - return err; -} -EXPORT_SYMBOL(generic_osync_inode); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b152761c1bf6..51d9e33d634f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -286,8 +286,8 @@ __releases(&fc->lock) } if (fc->num_background == fc->congestion_threshold && fc->connected && fc->bdi_initialized) { - clear_bdi_congested(&fc->bdi, READ); - clear_bdi_congested(&fc->bdi, WRITE); + clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); + clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); } fc->num_background--; fc->active_background--; @@ -414,8 +414,8 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc, fc->blocked = 1; if (fc->num_background == fc->congestion_threshold && fc->bdi_initialized) { - set_bdi_congested(&fc->bdi, READ); - set_bdi_congested(&fc->bdi, WRITE); + set_bdi_congested(&fc->bdi, BLK_RW_SYNC); + set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8f9aca8d4ad5..6da947daabda 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -878,6 +878,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { int err; + fc->bdi.name = "fuse"; fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; /* fuse does it's own writeback accounting */ @@ -970,6 +971,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (err) goto err_put_conn; + sb->s_bdi = &fc->bdi; + /* Handle umasking inside the fuse code */ if (sb->s_flags & MS_POSIXACL) fc->dont_mask = 1; diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 3da2f1f4f738..21f7e46da4c0 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,6 +1,6 @@ EXTRA_CFLAGS := -I$(src) obj-$(CONFIG_GFS2_FS) += gfs2.o -gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ +gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ glops.o inode.o log.o lops.o main.o meta_io.o \ aops.o dentry.o export.o file.o \ ops_fstype.o ops_inode.o quota.o \ diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index fa881bdc3d85..3fc4e3ac7d84 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -19,8 +19,7 @@ #include "gfs2.h" #include "incore.h" #include "acl.h" -#include "eaops.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "inode.h" #include "meta_io.h" @@ -31,8 +30,7 @@ #define ACL_DEFAULT 0 int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, - struct gfs2_ea_request *er, - int *remove, mode_t *mode) + struct gfs2_ea_request *er, int *remove, mode_t *mode) { struct posix_acl *acl; int error; @@ -83,30 +81,20 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access) return 0; } -static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, - struct gfs2_ea_location *el, char **data, unsigned int *len) +static int acl_get(struct gfs2_inode *ip, const char *name, + struct posix_acl **acl, struct gfs2_ea_location *el, + char **datap, unsigned int *lenp) { - struct gfs2_ea_request er; - struct gfs2_ea_location el_this; + char *data; + unsigned int len; int error; + el->el_bh = NULL; + if (!ip->i_eattr) return 0; - memset(&er, 0, sizeof(struct gfs2_ea_request)); - if (access) { - er.er_name = GFS2_POSIX_ACL_ACCESS; - er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; - } else { - er.er_name = GFS2_POSIX_ACL_DEFAULT; - er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; - } - er.er_type = GFS2_EATYPE_SYS; - - if (!el) - el = &el_this; - - error = gfs2_ea_find(ip, &er, el); + error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el); if (error) return error; if (!el->el_ea) @@ -114,32 +102,31 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, if (!GFS2_EA_DATA_LEN(el->el_ea)) goto out; - er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); - er.er_data = kmalloc(er.er_data_len, GFP_NOFS); + len = GFS2_EA_DATA_LEN(el->el_ea); + data = kmalloc(len, GFP_NOFS); error = -ENOMEM; - if (!er.er_data) + if (!data) goto out; - error = gfs2_ea_get_copy(ip, el, er.er_data); - if (error) + error = gfs2_ea_get_copy(ip, el, data, len); + if (error < 0) goto out_kfree; + error = 0; if (acl) { - *acl = posix_acl_from_xattr(er.er_data, er.er_data_len); + *acl = posix_acl_from_xattr(data, len); if (IS_ERR(*acl)) error = PTR_ERR(*acl); } out_kfree: - if (error || !data) - kfree(er.er_data); - else { - *data = er.er_data; - *len = er.er_data_len; + if (error || !datap) { + kfree(data); + } else { + *datap = data; + *lenp = len; } out: - if (error || el == &el_this) - brelse(el->el_bh); return error; } @@ -153,10 +140,12 @@ out: int gfs2_check_acl(struct inode *inode, int mask) { + struct gfs2_ea_location el; struct posix_acl *acl = NULL; int error; - error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL); + error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL); + brelse(el.el_bh); if (error) return error; @@ -196,10 +185,12 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode) int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) { + struct gfs2_ea_location el; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct posix_acl *acl = NULL, *clone; - struct gfs2_ea_request er; mode_t mode = ip->i_inode.i_mode; + char *data = NULL; + unsigned int len; int error; if (!sdp->sd_args.ar_posix_acl) @@ -207,11 +198,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) if (S_ISLNK(ip->i_inode.i_mode)) return 0; - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = GFS2_EATYPE_SYS; - - error = acl_get(dip, ACL_DEFAULT, &acl, NULL, - &er.er_data, &er.er_data_len); + error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len); + brelse(el.el_bh); if (error) return error; if (!acl) { @@ -229,9 +217,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) acl = clone; if (S_ISDIR(ip->i_inode.i_mode)) { - er.er_name = GFS2_POSIX_ACL_DEFAULT; - er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; - error = gfs2_system_eaops.eo_set(ip, &er); + error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS, + GFS2_POSIX_ACL_DEFAULT, data, len, 0); if (error) goto out; } @@ -239,21 +226,19 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) error = posix_acl_create_masq(acl, &mode); if (error < 0) goto out; - if (error > 0) { - er.er_name = GFS2_POSIX_ACL_ACCESS; - er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; - posix_acl_to_xattr(acl, er.er_data, er.er_data_len); - er.er_mode = mode; - er.er_flags = GFS2_ERF_MODE; - error = gfs2_system_eaops.eo_set(ip, &er); - if (error) - goto out; - } else - munge_mode(ip, mode); + if (error == 0) + goto munge; + posix_acl_to_xattr(acl, data, len); + error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS, + GFS2_POSIX_ACL_ACCESS, data, len, 0); + if (error) + goto out; +munge: + error = munge_mode(ip, mode); out: posix_acl_release(acl); - kfree(er.er_data); + kfree(data); return error; } @@ -265,9 +250,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) unsigned int len; int error; - error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len); + error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len); if (error) - return error; + goto out_brelse; if (!acl) return gfs2_setattr_simple(ip, attr); @@ -286,8 +271,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) out: posix_acl_release(acl); - brelse(el.el_bh); kfree(data); +out_brelse: + brelse(el.el_bh); return error; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 03ebb439ace0..7ebae9a4ecc0 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -624,6 +624,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, { struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); unsigned int data_blocks = 0, ind_blocks = 0, rblocks; int alloc_required; int error = 0; @@ -637,6 +638,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, error = gfs2_glock_nq(&ip->i_gh); if (unlikely(error)) goto out_uninit; + if (&ip->i_inode == sdp->sd_rindex) { + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, &m_ip->i_gh); + if (unlikely(error)) { + gfs2_glock_dq(&ip->i_gh); + goto out_uninit; + } + } error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); if (error) @@ -667,6 +676,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, rblocks += data_blocks ? data_blocks : 1; if (ind_blocks || data_blocks) rblocks += RES_STATFS + RES_QUOTA; + if (&ip->i_inode == sdp->sd_rindex) + rblocks += 2 * RES_STATFS; error = gfs2_trans_begin(sdp, rblocks, PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); @@ -712,6 +723,10 @@ out_alloc_put: gfs2_alloc_put(ip); } out_unlock: + if (&ip->i_inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } gfs2_glock_dq(&ip->i_gh); out_uninit: gfs2_holder_uninit(&ip->i_gh); @@ -725,14 +740,21 @@ out_uninit: static void adjust_fs_space(struct inode *inode) { struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh, *l_bh; u64 fs_total, new_free; /* Total up the file system space, according to the latest rindex. */ fs_total = gfs2_ri_total(sdp); + if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0) + return; spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); if (fs_total > (m_sc->sc_total + l_sc->sc_total)) new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); else @@ -741,6 +763,13 @@ static void adjust_fs_space(struct inode *inode) fs_warn(sdp, "File system extended by %llu blocks.\n", (unsigned long long)new_free); gfs2_statfs_change(sdp, new_free, new_free, 0); + + if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0) + goto out; + update_statfs(sdp, m_bh, l_bh); + brelse(l_bh); +out: + brelse(m_bh); } /** @@ -763,6 +792,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); u64 to = pos + copied; void *kaddr; unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); @@ -794,6 +824,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, brelse(dibh); gfs2_trans_end(sdp); + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } gfs2_glock_dq(&ip->i_gh); gfs2_holder_uninit(&ip->i_gh); return copied; @@ -823,6 +857,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct buffer_head *dibh; struct gfs2_alloc *al = ip->i_alloc; unsigned int from = pos & (PAGE_CACHE_SIZE - 1); @@ -865,6 +900,10 @@ failed: gfs2_quota_unlock(ip); gfs2_alloc_put(ip); } + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } gfs2_glock_dq(&ip->i_gh); gfs2_holder_uninit(&ip->i_gh); return ret; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 022c66cd5606..91beddadd388 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -107,8 +107,26 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str) return 0; } +static int gfs2_dentry_delete(struct dentry *dentry) +{ + struct gfs2_inode *ginode; + + if (!dentry->d_inode) + return 0; + + ginode = GFS2_I(dentry->d_inode); + if (!ginode->i_iopen_gh.gh_gl) + return 0; + + if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags)) + return 1; + + return 0; +} + const struct dentry_operations gfs2_dops = { .d_revalidate = gfs2_drevalidate, .d_hash = gfs2_dhash, + .d_delete = gfs2_dentry_delete, }; diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c deleted file mode 100644 index dee9b03e5b37..000000000000 --- a/fs/gfs2/eaops.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/capability.h> -#include <linux/xattr.h> -#include <linux/gfs2_ondisk.h> -#include <asm/uaccess.h> - -#include "gfs2.h" -#include "incore.h" -#include "acl.h" -#include "eaops.h" -#include "eattr.h" -#include "util.h" - -/** - * gfs2_ea_name2type - get the type of the ea, and truncate type from the name - * @namep: ea name, possibly with type appended - * - * Returns: GFS2_EATYPE_XXX - */ - -unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name) -{ - unsigned int type; - - if (strncmp(name, "system.", 7) == 0) { - type = GFS2_EATYPE_SYS; - if (truncated_name) - *truncated_name = name + sizeof("system.") - 1; - } else if (strncmp(name, "user.", 5) == 0) { - type = GFS2_EATYPE_USR; - if (truncated_name) - *truncated_name = name + sizeof("user.") - 1; - } else if (strncmp(name, "security.", 9) == 0) { - type = GFS2_EATYPE_SECURITY; - if (truncated_name) - *truncated_name = name + sizeof("security.") - 1; - } else { - type = GFS2_EATYPE_UNUSED; - if (truncated_name) - *truncated_name = NULL; - } - - return type; -} - -static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) && - !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 && - (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) || - GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))) - return -EOPNOTSUPP; - - return gfs2_ea_get_i(ip, er); -} - -static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - int remove = 0; - int error; - - if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { - if (!(er->er_flags & GFS2_ERF_MODE)) { - er->er_mode = ip->i_inode.i_mode; - er->er_flags |= GFS2_ERF_MODE; - } - error = gfs2_acl_validate_set(ip, 1, er, - &remove, &er->er_mode); - if (error) - return error; - error = gfs2_ea_set_i(ip, er); - if (error) - return error; - if (remove) - gfs2_ea_remove_i(ip, er); - return 0; - - } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { - error = gfs2_acl_validate_set(ip, 0, er, - &remove, NULL); - if (error) - return error; - if (!remove) - error = gfs2_ea_set_i(ip, er); - else { - error = gfs2_ea_remove_i(ip, er); - if (error == -ENODATA) - error = 0; - } - return error; - } - - return -EPERM; -} - -static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { - int error = gfs2_acl_validate_remove(ip, 1); - if (error) - return error; - - } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { - int error = gfs2_acl_validate_remove(ip, 0); - if (error) - return error; - - } else - return -EPERM; - - return gfs2_ea_remove_i(ip, er); -} - -static const struct gfs2_eattr_operations gfs2_user_eaops = { - .eo_get = gfs2_ea_get_i, - .eo_set = gfs2_ea_set_i, - .eo_remove = gfs2_ea_remove_i, - .eo_name = "user", -}; - -const struct gfs2_eattr_operations gfs2_system_eaops = { - .eo_get = system_eo_get, - .eo_set = system_eo_set, - .eo_remove = system_eo_remove, - .eo_name = "system", -}; - -static const struct gfs2_eattr_operations gfs2_security_eaops = { - .eo_get = gfs2_ea_get_i, - .eo_set = gfs2_ea_set_i, - .eo_remove = gfs2_ea_remove_i, - .eo_name = "security", -}; - -const struct gfs2_eattr_operations *gfs2_ea_ops[] = { - NULL, - &gfs2_user_eaops, - &gfs2_system_eaops, - &gfs2_security_eaops, -}; - diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h deleted file mode 100644 index da2f7fbbb40d..000000000000 --- a/fs/gfs2/eaops.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __EAOPS_DOT_H__ -#define __EAOPS_DOT_H__ - -struct gfs2_ea_request; -struct gfs2_inode; - -struct gfs2_eattr_operations { - int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er); - int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er); - int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er); - char *eo_name; -}; - -unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name); - -extern const struct gfs2_eattr_operations gfs2_system_eaops; - -extern const struct gfs2_eattr_operations *gfs2_ea_ops[]; - -#endif /* __EAOPS_DOT_H__ */ - diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 9200ef221716..d15876e9aa26 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -143,17 +143,14 @@ static struct dentry *gfs2_get_parent(struct dentry *child) } static struct dentry *gfs2_get_dentry(struct super_block *sb, - struct gfs2_inum_host *inum) + struct gfs2_inum_host *inum) { struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_holder i_gh, ri_gh, rgd_gh; - struct gfs2_rgrpd *rgd; + struct gfs2_holder i_gh; struct inode *inode; struct dentry *dentry; int error; - /* System files? */ - inode = gfs2_ilookup(sb, inum->no_addr); if (inode) { if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { @@ -168,29 +165,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, if (error) return ERR_PTR(error); - error = gfs2_rindex_hold(sdp, &ri_gh); + error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE); if (error) goto fail; - error = -EINVAL; - rgd = gfs2_blk2rgrpd(sdp, inum->no_addr); - if (!rgd) - goto fail_rindex; - - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); - if (error) - goto fail_rindex; - - error = -ESTALE; - if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE) - goto fail_rgd; - - gfs2_glock_dq_uninit(&rgd_gh); - gfs2_glock_dq_uninit(&ri_gh); - - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, - inum->no_addr, - 0, 0); + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto fail; @@ -224,13 +203,6 @@ out_inode: if (!IS_ERR(dentry)) dentry->d_op = &gfs2_dops; return dentry; - -fail_rgd: - gfs2_glock_dq_uninit(&rgd_gh); - -fail_rindex: - gfs2_glock_dq_uninit(&ri_gh); - fail: gfs2_glock_dq_uninit(&i_gh); return ERR_PTR(error); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 73318a3ce6f1..166f38fbd246 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -38,7 +38,6 @@ #include "rgrp.h" #include "trans.h" #include "util.h" -#include "eaops.h" /** * gfs2_llseek - seek to a location in a file diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 297421c0427a..8b674b1f3a55 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -63,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int static DECLARE_RWSEM(gfs2_umount_flush_sem); static struct dentry *gfs2_root; static struct workqueue_struct *glock_workqueue; +struct workqueue_struct *gfs2_delete_workqueue; static LIST_HEAD(lru_list); static atomic_t lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(lru_lock); @@ -167,13 +168,33 @@ static void glock_free(struct gfs2_glock *gl) * */ -static void gfs2_glock_hold(struct gfs2_glock *gl) +void gfs2_glock_hold(struct gfs2_glock *gl) { GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); atomic_inc(&gl->gl_ref); } /** + * demote_ok - Check to see if it's ok to unlock a glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int demote_ok(const struct gfs2_glock *gl) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (gl->gl_state == LM_ST_UNLOCKED) + return 0; + if (!list_empty(&gl->gl_holders)) + return 0; + if (glops->go_demote_ok) + return glops->go_demote_ok(gl); + return 1; +} + +/** * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list * @gl: the glock * @@ -181,8 +202,13 @@ static void gfs2_glock_hold(struct gfs2_glock *gl) static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) { + int may_reclaim; + may_reclaim = (demote_ok(gl) && + (atomic_read(&gl->gl_ref) == 1 || + (gl->gl_name.ln_type == LM_TYPE_INODE && + atomic_read(&gl->gl_ref) <= 2))); spin_lock(&lru_lock); - if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) { + if (list_empty(&gl->gl_lru) && may_reclaim) { list_add_tail(&gl->gl_lru, &lru_list); atomic_inc(&lru_count); } @@ -190,6 +216,21 @@ static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) } /** + * gfs2_glock_put_nolock() - Decrement reference count on glock + * @gl: The glock to put + * + * This function should only be used if the caller has its own reference + * to the glock, in addition to the one it is dropping. + */ + +void gfs2_glock_put_nolock(struct gfs2_glock *gl) +{ + if (atomic_dec_and_test(&gl->gl_ref)) + GLOCK_BUG_ON(gl, 1); + gfs2_glock_schedule_for_reclaim(gl); +} + +/** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put * @@ -214,9 +255,9 @@ int gfs2_glock_put(struct gfs2_glock *gl) rv = 1; goto out; } - /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */ - if (atomic_read(&gl->gl_ref) == 2) - gfs2_glock_schedule_for_reclaim(gl); + spin_lock(&gl->gl_spin); + gfs2_glock_schedule_for_reclaim(gl); + spin_unlock(&gl->gl_spin); write_unlock(gl_lock_addr(gl->gl_hash)); out: return rv; @@ -398,7 +439,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) if (held2) gfs2_glock_hold(gl); else - gfs2_glock_put(gl); + gfs2_glock_put_nolock(gl); } gl->gl_state = new_state; @@ -633,12 +674,35 @@ out: out_sched: gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); + gfs2_glock_put_nolock(gl); out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); goto out; } +static void delete_work_func(struct work_struct *work) +{ + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = NULL; + struct inode *inode; + u64 no_addr = 0; + + spin_lock(&gl->gl_spin); + ip = (struct gfs2_inode *)gl->gl_object; + if (ip) + no_addr = ip->i_no_addr; + spin_unlock(&gl->gl_spin); + if (ip) { + inode = gfs2_ilookup(sdp->sd_vfs, no_addr); + if (inode) { + d_prune_aliases(inode); + iput(inode); + } + } + gfs2_glock_put(gl); +} + static void glock_work_func(struct work_struct *work) { unsigned long delay = 0; @@ -717,6 +781,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_sbd = sdp; gl->gl_aspace = NULL; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); + INIT_WORK(&gl->gl_delete, delete_work_func); /* If this glock protects actual on-disk data or metadata blocks, create a VFS inode to manage the pages/buffers holding them. */ @@ -858,6 +923,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, gl->gl_demote_state != state) { gl->gl_demote_state = LM_ST_UNLOCKED; } + if (gl->gl_ops->go_callback) + gl->gl_ops->go_callback(gl); trace_gfs2_demote_rq(gl); } @@ -1274,33 +1341,12 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) gfs2_glock_put(gl); } -/** - * demote_ok - Check to see if it's ok to unlock a glock - * @gl: the glock - * - * Returns: 1 if it's ok - */ - -static int demote_ok(const struct gfs2_glock *gl) -{ - const struct gfs2_glock_operations *glops = gl->gl_ops; - - if (gl->gl_state == LM_ST_UNLOCKED) - return 0; - if (!list_empty(&gl->gl_holders)) - return 0; - if (glops->go_demote_ok) - return glops->go_demote_ok(gl); - return 1; -} - static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) { struct gfs2_glock *gl; int may_demote; int nr_skipped = 0; - int got_ref = 0; LIST_HEAD(skipped); if (nr == 0) @@ -1315,37 +1361,29 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) list_del_init(&gl->gl_lru); atomic_dec(&lru_count); + /* Check if glock is about to be freed */ + if (atomic_read(&gl->gl_ref) == 0) + continue; + /* Test for being demotable */ if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { gfs2_glock_hold(gl); - got_ref = 1; spin_unlock(&lru_lock); spin_lock(&gl->gl_spin); may_demote = demote_ok(gl); - spin_unlock(&gl->gl_spin); - clear_bit(GLF_LOCK, &gl->gl_flags); if (may_demote) { handle_callback(gl, LM_ST_UNLOCKED, 0); nr--; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); - got_ref = 0; } + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put_nolock(gl); + spin_unlock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); spin_lock(&lru_lock); - if (may_demote) - continue; - } - if (list_empty(&gl->gl_lru) && - (atomic_read(&gl->gl_ref) <= (2 + got_ref))) { - nr_skipped++; - list_add(&gl->gl_lru, &skipped); - } - if (got_ref) { - spin_unlock(&lru_lock); - gfs2_glock_put(gl); - spin_lock(&lru_lock); - got_ref = 0; + continue; } + nr_skipped++; + list_add(&gl->gl_lru, &skipped); } list_splice(&skipped, &lru_list); atomic_add(nr_skipped, &lru_count); @@ -1727,6 +1765,11 @@ int __init gfs2_glock_init(void) glock_workqueue = create_workqueue("glock_workqueue"); if (IS_ERR(glock_workqueue)) return PTR_ERR(glock_workqueue); + gfs2_delete_workqueue = create_workqueue("delete_workqueue"); + if (IS_ERR(gfs2_delete_workqueue)) { + destroy_workqueue(glock_workqueue); + return PTR_ERR(gfs2_delete_workqueue); + } register_shrinker(&glock_shrinker); @@ -1737,6 +1780,7 @@ void gfs2_glock_exit(void) { unregister_shrinker(&glock_shrinker); destroy_workqueue(glock_workqueue); + destroy_workqueue(gfs2_delete_workqueue); } static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index a602a28f6f08..c609894ec0d0 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -143,6 +143,7 @@ struct lm_lockops { #define GLR_TRYFAILED 13 +extern struct workqueue_struct *gfs2_delete_workqueue; static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { struct gfs2_holder *gh; @@ -191,6 +192,8 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); +void gfs2_glock_hold(struct gfs2_glock *gl); +void gfs2_glock_put_nolock(struct gfs2_glock *gl); int gfs2_glock_put(struct gfs2_glock *gl); void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, struct gfs2_holder *gh); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index d5e4ab155ca0..6985eef06c39 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -323,6 +323,7 @@ static void trans_go_sync(struct gfs2_glock *gl) if (gl->gl_state != LM_ST_UNLOCKED && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + flush_workqueue(gfs2_delete_workqueue); gfs2_meta_syncfs(sdp); gfs2_log_shutdown(sdp); } @@ -372,6 +373,25 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl) return 0; } +/** + * iopen_go_callback - schedule the dcache entry for the inode to be deleted + * @gl: the glock + * + * gl_spin lock is held while calling this + */ +static void iopen_go_callback(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; + + if (gl->gl_demote_state == LM_ST_UNLOCKED && + gl->gl_state == LM_ST_SHARED && + ip && test_bit(GIF_USER, &ip->i_flags)) { + gfs2_glock_hold(gl); + if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_put_nolock(gl); + } +} + const struct gfs2_glock_operations gfs2_meta_glops = { .go_type = LM_TYPE_META, }; @@ -406,6 +426,7 @@ const struct gfs2_glock_operations gfs2_trans_glops = { const struct gfs2_glock_operations gfs2_iopen_glops = { .go_type = LM_TYPE_IOPEN, + .go_callback = iopen_go_callback, }; const struct gfs2_glock_operations gfs2_flock_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 225347fbff3c..6edb423f90b3 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -159,6 +159,7 @@ struct gfs2_glock_operations { int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_callback) (struct gfs2_glock *gl); const int go_type; const unsigned long go_min_hold_time; }; @@ -228,6 +229,7 @@ struct gfs2_glock { struct list_head gl_ail_list; atomic_t gl_ail_count; struct delayed_work gl_work; + struct work_struct gl_delete; }; #define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ @@ -404,6 +406,12 @@ struct gfs2_statfs_change_host { #define GFS2_DATA_WRITEBACK 1 #define GFS2_DATA_ORDERED 2 +#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW +#define GFS2_ERRORS_WITHDRAW 0 +#define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */ +#define GFS2_ERRORS_RO 2 /* place holder for future feature */ +#define GFS2_ERRORS_PANIC 3 + struct gfs2_args { char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ @@ -420,6 +428,7 @@ struct gfs2_args { unsigned int ar_data:2; /* ordered/writeback */ unsigned int ar_meta:1; /* mount metafs */ unsigned int ar_discard:1; /* discard requests */ + unsigned int ar_errors:2; /* errors=withdraw | panic */ int ar_commit; /* Commit interval */ }; @@ -487,7 +496,6 @@ struct gfs2_sb_host { */ struct lm_lockstruct { - u32 ls_id; unsigned int ls_jid; unsigned int ls_first; unsigned int ls_first_done; @@ -539,18 +547,12 @@ struct gfs2_sbd { struct dentry *sd_root_dir; struct inode *sd_jindex; - struct inode *sd_inum_inode; struct inode *sd_statfs_inode; - struct inode *sd_ir_inode; struct inode *sd_sc_inode; struct inode *sd_qc_inode; struct inode *sd_rindex; struct inode *sd_quota_inode; - /* Inum stuff */ - - struct mutex sd_inum_mutex; - /* StatFS stuff */ spinlock_t sd_statfs_spin; @@ -578,7 +580,6 @@ struct gfs2_sbd { struct gfs2_holder sd_journal_gh; struct gfs2_holder sd_jinode_gh; - struct gfs2_holder sd_ir_gh; struct gfs2_holder sd_sc_gh; struct gfs2_holder sd_qc_gh; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2f94bd723698..fb15d3b1f409 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -24,7 +24,7 @@ #include "acl.h" #include "bmap.h" #include "dir.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "glops.h" #include "inode.h" @@ -519,139 +519,6 @@ out: return inode ? inode : ERR_PTR(error); } -static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf) -{ - const struct gfs2_inum_range *str = buf; - - ir->ir_start = be64_to_cpu(str->ir_start); - ir->ir_length = be64_to_cpu(str->ir_length); -} - -static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf) -{ - struct gfs2_inum_range *str = buf; - - str->ir_start = cpu_to_be64(ir->ir_start); - str->ir_length = cpu_to_be64(ir->ir_length); -} - -static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino) -{ - struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); - struct buffer_head *bh; - struct gfs2_inum_range_host ir; - int error; - - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error) - return error; - mutex_lock(&sdp->sd_inum_mutex); - - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) { - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); - return error; - } - - gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); - - if (ir.ir_length) { - *formal_ino = ir.ir_start++; - ir.ir_length--; - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_inum_range_out(&ir, - bh->b_data + sizeof(struct gfs2_dinode)); - brelse(bh); - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); - return 0; - } - - brelse(bh); - - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); - - return 1; -} - -static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino) -{ - struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); - struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode); - struct gfs2_holder gh; - struct buffer_head *bh; - struct gfs2_inum_range_host ir; - int error; - - error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (error) - return error; - - error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); - if (error) - goto out; - mutex_lock(&sdp->sd_inum_mutex); - - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - goto out_end_trans; - - gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); - - if (!ir.ir_length) { - struct buffer_head *m_bh; - u64 x, y; - __be64 z; - - error = gfs2_meta_inode_buffer(m_ip, &m_bh); - if (error) - goto out_brelse; - - z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)); - x = y = be64_to_cpu(z); - ir.ir_start = x; - ir.ir_length = GFS2_INUM_QUANTUM; - x += GFS2_INUM_QUANTUM; - if (x < y) - gfs2_consist_inode(m_ip); - z = cpu_to_be64(x); - gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); - *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z; - - brelse(m_bh); - } - - *formal_ino = ir.ir_start++; - ir.ir_length--; - - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode)); - -out_brelse: - brelse(bh); -out_end_trans: - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); -out: - gfs2_glock_dq_uninit(&gh); - return error; -} - -static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum) -{ - int error; - - error = pick_formal_ino_1(sdp, inum); - if (error <= 0) - return error; - - error = pick_formal_ino_2(sdp, inum); - - return error; -} - /** * create_ok - OK to create a new on-disk inode here? * @dip: Directory in which dinode is to be created @@ -731,7 +598,7 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) if (error) goto out_ipreserv; - *no_addr = gfs2_alloc_di(dip, generation); + error = gfs2_alloc_di(dip, no_addr, generation); gfs2_trans_end(sdp); @@ -924,7 +791,6 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) size_t len; void *value; char *name; - struct gfs2_ea_request er; err = security_inode_init_security(&ip->i_inode, &dip->i_inode, &name, &value, &len); @@ -935,16 +801,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) return err; } - memset(&er, 0, sizeof(struct gfs2_ea_request)); - - er.er_type = GFS2_EATYPE_SECURITY; - er.er_name = name; - er.er_data = value; - er.er_name_len = strlen(name); - er.er_data_len = len; - - err = gfs2_ea_set_i(ip, &er); - + err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0); kfree(value); kfree(name); @@ -991,13 +848,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock; - error = pick_formal_ino(sdp, &inum.no_formal_ino); - if (error) - goto fail_gunlock; - error = alloc_dinode(dip, &inum.no_addr, &generation); if (error) goto fail_gunlock; + inum.no_formal_ino = generation; error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); @@ -1008,9 +862,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock2; - inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), - inum.no_addr, - inum.no_formal_ino, 0); + inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, + inum.no_formal_ino, 0); if (IS_ERR(inode)) goto fail_gunlock2; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7bc3c45cd676..52fb6c048981 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -84,7 +84,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) gfs2_tune_init(&sdp->sd_tune); - mutex_init(&sdp->sd_inum_mutex); spin_lock_init(&sdp->sd_statfs_spin); spin_lock_init(&sdp->sd_rindex_spin); @@ -833,21 +832,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) if (error) goto fail; - /* Read in the master inode number inode */ - sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum"); - if (IS_ERR(sdp->sd_inum_inode)) { - error = PTR_ERR(sdp->sd_inum_inode); - fs_err(sdp, "can't read in inum inode: %d\n", error); - goto fail_journal; - } - - /* Read in the master statfs inode */ sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); if (IS_ERR(sdp->sd_statfs_inode)) { error = PTR_ERR(sdp->sd_statfs_inode); fs_err(sdp, "can't read in statfs inode: %d\n", error); - goto fail_inum; + goto fail_journal; } /* Read in the resource index inode */ @@ -876,8 +866,6 @@ fail_rindex: iput(sdp->sd_rindex); fail_statfs: iput(sdp->sd_statfs_inode); -fail_inum: - iput(sdp->sd_inum_inode); fail_journal: init_journal(sdp, UNDO); fail: @@ -905,20 +893,12 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) return error; } - sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid); - sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf); - if (IS_ERR(sdp->sd_ir_inode)) { - error = PTR_ERR(sdp->sd_ir_inode); - fs_err(sdp, "can't find local \"ir\" file: %d\n", error); - goto fail; - } - sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); if (IS_ERR(sdp->sd_sc_inode)) { error = PTR_ERR(sdp->sd_sc_inode); fs_err(sdp, "can't find local \"sc\" file: %d\n", error); - goto fail_ir_i; + goto fail; } sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); @@ -932,27 +912,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) iput(pn); pn = NULL; - ip = GFS2_I(sdp->sd_ir_inode); - error = gfs2_glock_nq_init(ip->i_gl, - LM_ST_EXCLUSIVE, 0, - &sdp->sd_ir_gh); - if (error) { - fs_err(sdp, "can't lock local \"ir\" file: %d\n", error); - goto fail_qc_i; - } - ip = GFS2_I(sdp->sd_sc_inode); - error = gfs2_glock_nq_init(ip->i_gl, - LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &sdp->sd_sc_gh); if (error) { fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); - goto fail_ir_gh; + goto fail_qc_i; } ip = GFS2_I(sdp->sd_qc_inode); - error = gfs2_glock_nq_init(ip->i_gl, - LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &sdp->sd_qc_gh); if (error) { fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); @@ -965,14 +934,10 @@ fail_qc_gh: gfs2_glock_dq_uninit(&sdp->sd_qc_gh); fail_ut_gh: gfs2_glock_dq_uninit(&sdp->sd_sc_gh); -fail_ir_gh: - gfs2_glock_dq_uninit(&sdp->sd_ir_gh); fail_qc_i: iput(sdp->sd_qc_inode); fail_ut_i: iput(sdp->sd_sc_inode); -fail_ir_i: - iput(sdp->sd_ir_inode); fail: if (pn) iput(pn); @@ -1063,7 +1028,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) ls->ls_ops = lm; ls->ls_first = 1; - ls->ls_id = 0; for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { substring_t tmp[MAX_OPT_ARGS]; @@ -1081,10 +1045,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) ls->ls_jid = option; break; case Opt_id: - ret = match_int(&tmp[0], &option); - if (ret) - goto hostdata_error; - ls->ls_id = option; + /* Obsolete, but left for backward compat purposes */ break; case Opt_first: ret = match_int(&tmp[0], &option); @@ -1133,6 +1094,17 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp) lm->lm_unmount(sdp); } +void gfs2_online_uevent(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp); +} + /** * fill_super - Read in superblock * @sb: The VFS superblock @@ -1157,6 +1129,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; sdp->sd_args.ar_commit = 60; + sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT; error = gfs2_mount_args(sdp, &sdp->sd_args, data); if (error) { @@ -1174,6 +1147,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; sb->s_export_op = &gfs2_export_ops; + sb->s_xattr = gfs2_xattr_handlers; sb->s_time_gran = 1; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -1236,7 +1210,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) } gfs2_glock_dq_uninit(&mount_gh); - + gfs2_online_uevent(sdp); return 0; fail_threads: diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index f8bd20baf99c..c3ac18054057 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -26,8 +26,7 @@ #include "acl.h" #include "bmap.h" #include "dir.h" -#include "eaops.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "inode.h" #include "meta_io.h" @@ -349,7 +348,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); if (error) - goto out_rgrp; + goto out_gunlock; error = gfs2_dir_del(dip, &dentry->d_name); if (error) @@ -1302,60 +1301,53 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name, const void *data, size_t size, int flags) { struct inode *inode = dentry->d_inode; - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = gfs2_ea_name2type(name, &er.er_name); - if (er.er_type == GFS2_EATYPE_UNUSED) - return -EOPNOTSUPP; - er.er_data = (char *)data; - er.er_name_len = strlen(er.er_name); - er.er_data_len = size; - er.er_flags = flags; - - gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE)); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; - return gfs2_ea_set(GFS2_I(inode), &er); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_setxattr(dentry, name, data, size, flags); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; } static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, void *data, size_t size) { - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = gfs2_ea_name2type(name, &er.er_name); - if (er.er_type == GFS2_EATYPE_UNUSED) - return -EOPNOTSUPP; - er.er_data = data; - er.er_name_len = strlen(er.er_name); - er.er_data_len = size; - - return gfs2_ea_get(GFS2_I(dentry->d_inode), &er); -} - -static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_data = (size) ? buffer : NULL; - er.er_data_len = size; + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; - return gfs2_ea_list(GFS2_I(dentry->d_inode), &er); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_getxattr(dentry, name, data, size); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; } static int gfs2_removexattr(struct dentry *dentry, const char *name) { - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = gfs2_ea_name2type(name, &er.er_name); - if (er.er_type == GFS2_EATYPE_UNUSED) - return -EOPNOTSUPP; - er.er_name_len = strlen(er.er_name); + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; - return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_removexattr(dentry, name); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; } static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index daa4ae341a29..28c590b7c9da 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -285,27 +285,19 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) } tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; - if (count[1] + count[2] != tmp) { + if (count[1] != tmp) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "used data mismatch: %u != %u\n", count[1], tmp); return; } - if (count[3] != rgd->rd_dinodes) { + if (count[2] + count[3] != rgd->rd_dinodes) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "used metadata mismatch: %u != %u\n", - count[3], rgd->rd_dinodes); + count[2] + count[3], rgd->rd_dinodes); return; } - - if (count[2] > count[3]) { - if (gfs2_consist_rgrpd(rgd)) - fs_err(sdp, "unlinked inodes > inodes: %u\n", - count[2]); - return; - } - } static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) @@ -865,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, goto start_new_extent; if ((start + nr_sects) != blk) { rv = blkdev_issue_discard(bdev, start, - nr_sects, GFP_NOFS); + nr_sects, GFP_NOFS, + DISCARD_FL_BARRIER); if (rv) goto fail; nr_sects = 0; @@ -879,7 +872,8 @@ start_new_extent: } } if (nr_sects) { - rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS); + rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, + DISCARD_FL_BARRIER); if (rv) goto fail; } @@ -961,7 +955,8 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) * Returns: The inode, if one has been found */ -static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) +static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, + u64 skip) { struct inode *inode; u32 goal = 0, block; @@ -985,6 +980,8 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) goal++; if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked) continue; + if (no_addr == skip) + continue; *last_unlinked = no_addr; inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, no_addr, -1, 1); @@ -1104,7 +1101,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (try_rgrp_fit(rgd, al)) goto out; if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked); + inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); if (inode) @@ -1138,7 +1135,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (try_rgrp_fit(rgd, al)) goto out; if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked); + inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); if (inode) @@ -1261,7 +1258,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip) * Returns: The block type (GFS2_BLKST_*) */ -unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) +static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) { struct gfs2_bitmap *bi = NULL; u32 length, rgrp_block, buf_block; @@ -1464,6 +1461,16 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) return 0; } +static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", + (unsigned long long)rgd->rd_addr); + fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); + gfs2_rgrp_dump(NULL, rgd->rd_gl); + rgd->rd_flags |= GFS2_RDF_ERROR; +} + /** * gfs2_alloc_block - Allocate one or more blocks * @ip: the inode to allocate the block for @@ -1525,22 +1532,20 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) return 0; rgrp_error: - fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", - (unsigned long long)rgd->rd_addr); - fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); - gfs2_rgrp_dump(NULL, rgd->rd_gl); - rgd->rd_flags |= GFS2_RDF_ERROR; + gfs2_rgrp_error(rgd); return -EIO; } /** * gfs2_alloc_di - Allocate a dinode * @dip: the directory that the inode is going in + * @bn: the block number which is allocated + * @generation: the generation number of the inode * - * Returns: the block allocated + * Returns: 0 on success or error */ -u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) +int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_alloc *al = dip->i_alloc; @@ -1551,16 +1556,21 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) blk = rgblk_search(rgd, rgd->rd_last_alloc, GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); - BUG_ON(blk == BFITNOENT); - rgd->rd_last_alloc = blk; + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (blk == BFITNOENT) + goto rgrp_error; + rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; + if (rgd->rd_free == 0) + goto rgrp_error; - gfs2_assert_withdraw(sdp, rgd->rd_free); rgd->rd_free--; rgd->rd_dinodes++; *generation = rgd->rd_igeneration++; + if (*generation == 0) + *generation = rgd->rd_igeneration++; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -1573,7 +1583,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) rgd->rd_free_clone--; spin_unlock(&sdp->sd_rindex_spin); trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); - return block; + *bn = block; + return 0; + +rgrp_error: + gfs2_rgrp_error(rgd); + return -EIO; } /** @@ -1681,6 +1696,46 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) } /** + * gfs2_check_blk_type - Check the type of a block + * @sdp: The superblock + * @no_addr: The block number to check + * @type: The block type we are looking for + * + * Returns: 0 if the block type matches the expected type + * -ESTALE if it doesn't match + * or -ve errno if something went wrong while checking + */ + +int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh, rgd_gh; + int error; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto fail; + + error = -EINVAL; + rgd = gfs2_blk2rgrpd(sdp, no_addr); + if (!rgd) + goto fail_rindex; + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); + if (error) + goto fail_rindex; + + if (gfs2_get_block_type(rgd, no_addr) != type) + error = -ESTALE; + + gfs2_glock_dq_uninit(&rgd_gh); +fail_rindex: + gfs2_glock_dq_uninit(&ri_gh); +fail: + return error; +} + +/** * gfs2_rlist_add - add a RG to a list of RGs * @sdp: the filesystem * @rlist: the list of resource groups diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 1e76ff0f3e00..b4106ddaaa98 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -44,15 +44,15 @@ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) extern void gfs2_inplace_release(struct gfs2_inode *ip); -extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); - extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); -extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); +extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); extern void gfs2_unlink_di(struct inode *inode); +extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, + unsigned int type); struct gfs2_rgrp_list { unsigned int rl_rgrps; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 0a6801336470..0ec3ec672de1 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -38,7 +38,7 @@ #include "trans.h" #include "util.h" #include "sys.h" -#include "eattr.h" +#include "xattr.h" #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) @@ -68,6 +68,8 @@ enum { Opt_discard, Opt_nodiscard, Opt_commit, + Opt_err_withdraw, + Opt_err_panic, Opt_error, }; @@ -97,6 +99,8 @@ static const match_table_t tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_commit, "commit=%d"}, + {Opt_err_withdraw, "errors=withdraw"}, + {Opt_err_panic, "errors=panic"}, {Opt_error, NULL} }; @@ -152,6 +156,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) args->ar_localcaching = 1; break; case Opt_debug: + if (args->ar_errors == GFS2_ERRORS_PANIC) { + fs_info(sdp, "-o debug and -o errors=panic " + "are mutually exclusive.\n"); + return -EINVAL; + } args->ar_debug = 1; break; case Opt_nodebug: @@ -205,6 +214,17 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) return rv ? rv : -EINVAL; } break; + case Opt_err_withdraw: + args->ar_errors = GFS2_ERRORS_WITHDRAW; + break; + case Opt_err_panic: + if (args->ar_debug) { + fs_info(sdp, "-o debug and -o errors=panic " + "are mutually exclusive.\n"); + return -EINVAL; + } + args->ar_errors = GFS2_ERRORS_PANIC; + break; case Opt_error: default: fs_info(sdp, "invalid mount option: %s\n", o); @@ -353,7 +373,7 @@ fail: return error; } -static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) +void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) { const struct gfs2_statfs_change *str = buf; @@ -441,6 +461,29 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, brelse(l_bh); } +void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + + spin_lock(&sdp->sd_statfs_spin); + m_sc->sc_total += l_sc->sc_total; + m_sc->sc_free += l_sc->sc_free; + m_sc->sc_dinodes += l_sc->sc_dinodes; + memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); + memset(l_bh->b_data + sizeof(struct gfs2_dinode), + 0, sizeof(struct gfs2_statfs_change)); + spin_unlock(&sdp->sd_statfs_spin); + + gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); + gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); +} + int gfs2_statfs_sync(struct gfs2_sbd *sdp) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); @@ -477,19 +520,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp) if (error) goto out_bh2; - gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); - - spin_lock(&sdp->sd_statfs_spin); - m_sc->sc_total += l_sc->sc_total; - m_sc->sc_free += l_sc->sc_free; - m_sc->sc_dinodes += l_sc->sc_dinodes; - memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); - memset(l_bh->b_data + sizeof(struct gfs2_dinode), - 0, sizeof(struct gfs2_statfs_change)); - spin_unlock(&sdp->sd_statfs_spin); - - gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); - gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); + update_statfs(sdp, m_bh, l_bh); gfs2_trans_end(sdp); @@ -680,6 +711,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) struct gfs2_holder t_gh; int error; + flush_workqueue(gfs2_delete_workqueue); gfs2_quota_sync(sdp); gfs2_statfs_sync(sdp); @@ -756,7 +788,6 @@ restart: /* Release stuff */ iput(sdp->sd_jindex); - iput(sdp->sd_inum_inode); iput(sdp->sd_statfs_inode); iput(sdp->sd_rindex); iput(sdp->sd_quota_inode); @@ -767,10 +798,8 @@ restart: if (!sdp->sd_args.ar_spectator) { gfs2_glock_dq_uninit(&sdp->sd_journal_gh); gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); - gfs2_glock_dq_uninit(&sdp->sd_ir_gh); gfs2_glock_dq_uninit(&sdp->sd_sc_gh); gfs2_glock_dq_uninit(&sdp->sd_qc_gh); - iput(sdp->sd_ir_inode); iput(sdp->sd_sc_inode); iput(sdp->sd_qc_inode); } @@ -1072,6 +1101,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) gt->gt_log_flush_secs = args.ar_commit; spin_unlock(>->gt_spin); + gfs2_online_uevent(sdp); return 0; } @@ -1213,6 +1243,22 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) lfsecs = sdp->sd_tune.gt_log_flush_secs; if (lfsecs != 60) seq_printf(s, ",commit=%d", lfsecs); + if (args->ar_errors != GFS2_ERRORS_DEFAULT) { + const char *state; + + switch (args->ar_errors) { + case GFS2_ERRORS_WITHDRAW: + state = "withdraw"; + break; + case GFS2_ERRORS_PANIC: + state = "panic"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",errors=%s", state); + } return 0; } @@ -1240,6 +1286,10 @@ static void gfs2_delete_inode(struct inode *inode) goto out; } + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (error) + goto out_truncate; + gfs2_glock_dq_wait(&ip->i_iopen_gh); gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); error = gfs2_glock_nq(&ip->i_iopen_gh); diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index b56413e3e40d..235db3682885 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -25,7 +25,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) return x; } -void gfs2_jindex_free(struct gfs2_sbd *sdp); +extern void gfs2_jindex_free(struct gfs2_sbd *sdp); extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data); @@ -36,10 +36,14 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, struct gfs2_inode **ipp); extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); - +extern void gfs2_online_uevent(struct gfs2_sbd *sdp); extern int gfs2_statfs_init(struct gfs2_sbd *sdp); extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes); +extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, + const void *buf); +extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh); extern int gfs2_statfs_sync(struct gfs2_sbd *sdp); extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); @@ -50,6 +54,7 @@ extern struct file_system_type gfs2meta_fs_type; extern const struct export_operations gfs2_export_ops; extern const struct super_operations gfs2_super_ops; extern const struct dentry_operations gfs2_dops; +extern struct xattr_handler *gfs2_xattr_handlers[]; #endif /* __SUPER_DOT_H__ */ diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 23419dc3027b..446329728d52 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -16,6 +16,7 @@ #include <linux/kobject.h> #include <asm/uaccess.h> #include <linux/gfs2_ondisk.h> +#include <linux/genhd.h> #include "gfs2.h" #include "incore.h" @@ -319,12 +320,6 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) return ret; } -static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf) -{ - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - return sprintf(buf, "%u\n", ls->ls_id); -} - static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -386,22 +381,20 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) #define GDLM_ATTR(_name,_mode,_show,_store) \ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) -GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); -GDLM_ATTR(block, 0644, block_show, block_store); -GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); -GDLM_ATTR(id, 0444, lkid_show, NULL); -GDLM_ATTR(jid, 0444, jid_show, NULL); -GDLM_ATTR(first, 0444, lkfirst_show, NULL); -GDLM_ATTR(first_done, 0444, first_done_show, NULL); -GDLM_ATTR(recover, 0200, NULL, recover_store); -GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); -GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(jid, 0444, jid_show, NULL); +GDLM_ATTR(first, 0444, lkfirst_show, NULL); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0600, NULL, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); static struct attribute *lock_module_attrs[] = { &gdlm_attr_proto_name.attr, &gdlm_attr_block.attr, &gdlm_attr_withdraw.attr, - &gdlm_attr_id.attr, &gdlm_attr_jid.attr, &gdlm_attr_first.attr, &gdlm_attr_first_done.attr, @@ -519,7 +512,14 @@ static struct attribute_group lock_module_group = { int gfs2_sys_fs_add(struct gfs2_sbd *sdp) { + struct super_block *sb = sdp->sd_vfs; int error; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); sdp->sd_kobj.kset = gfs2_kset; error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, @@ -535,9 +535,17 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) if (error) goto fail_tune; - kobject_uevent(&sdp->sd_kobj, KOBJ_ADD); + error = sysfs_create_link(&sdp->sd_kobj, + &disk_to_dev(sb->s_bdev->bd_disk)->kobj, + "device"); + if (error) + goto fail_lock_module; + + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp); return 0; +fail_lock_module: + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); fail_tune: sysfs_remove_group(&sdp->sd_kobj, &tune_group); fail_reg: @@ -549,12 +557,12 @@ fail: void gfs2_sys_fs_del(struct gfs2_sbd *sdp) { + sysfs_remove_link(&sdp->sd_kobj, "device"); sysfs_remove_group(&sdp->sd_kobj, &tune_group); sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); kobject_put(&sdp->sd_kobj); } - static int gfs2_uevent(struct kset *kset, struct kobject *kobj, struct kobj_uevent_env *env) { @@ -563,6 +571,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); + if (!sdp->sd_args.ar_spectator) + add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); if (gfs2_uuid_valid(uuid)) { add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-" "%02X%02X-%02X%02X%02X%02X%02X%02X", @@ -578,7 +588,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = { .uevent = gfs2_uevent, }; - int gfs2_sys_init(void) { gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 98d6ef1c1dc0..148d55c14171 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -1,12 +1,11 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM gfs2 + #if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_GFS2_H #include <linux/tracepoint.h> -#undef TRACE_SYSTEM -#define TRACE_SYSTEM gfs2 -#define TRACE_INCLUDE_FILE trace_gfs2 - #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/dlmconstants.h> @@ -403,5 +402,6 @@ TRACE_EVENT(gfs2_block_alloc, /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_gfs2 #include <trace/define_trace.h> diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 9d12b1118ba0..f6a7efa34eb9 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -38,24 +38,30 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) const struct lm_lockops *lm = ls->ls_ops; va_list args; - if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && + test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) return 0; va_start(args, fmt); vprintk(fmt, args); va_end(args); - fs_err(sdp, "about to withdraw this file system\n"); - BUG_ON(sdp->sd_args.ar_debug); + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { + fs_err(sdp, "about to withdraw this file system\n"); + BUG_ON(sdp->sd_args.ar_debug); - kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); + kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); - if (lm->lm_unmount) { - fs_err(sdp, "telling LM to unmount\n"); - lm->lm_unmount(sdp); + if (lm->lm_unmount) { + fs_err(sdp, "telling LM to unmount\n"); + lm->lm_unmount(sdp); + } + fs_err(sdp, "withdrawn\n"); + dump_stack(); } - fs_err(sdp, "withdrawn\n"); - dump_stack(); + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname); return -1; } @@ -93,17 +99,24 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, gfs2_tune_get(sdp, gt_complain_secs) * HZ)) return -2; - printk(KERN_WARNING - "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, assertion, - sdp->sd_fsname, function, file, line); + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) + printk(KERN_WARNING + "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); if (sdp->sd_args.ar_debug) BUG(); else dump_stack(); + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + sdp->sd_last_warning = jiffies; return -1; diff --git a/fs/gfs2/eattr.c b/fs/gfs2/xattr.c index 07ea9529adda..8a0f8ef6ee27 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/xattr.c @@ -18,8 +18,7 @@ #include "gfs2.h" #include "incore.h" #include "acl.h" -#include "eaops.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "inode.h" #include "meta_io.h" @@ -38,26 +37,32 @@ * Returns: 1 if the EA should be stuffed */ -static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er, +static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize, unsigned int *size) { - *size = GFS2_EAREQ_SIZE_STUFFED(er); - if (*size <= sdp->sd_jbsize) + unsigned int jbsize = sdp->sd_jbsize; + + /* Stuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8); + + if (*size <= jbsize) return 1; - *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er); + /* Unstuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + + (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8); return 0; } -static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er) +static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize) { unsigned int size; - if (er->er_data_len > GFS2_EA_MAX_DATA_LEN) + if (dsize > GFS2_EA_MAX_DATA_LEN) return -ERANGE; - ea_calc_size(sdp, er, &size); + ea_calc_size(sdp, nsize, dsize, &size); /* This can only happen with 512 byte blocks */ if (size > sdp->sd_jbsize) @@ -151,7 +156,9 @@ out: } struct ea_find { - struct gfs2_ea_request *ef_er; + int type; + const char *name; + size_t namel; struct gfs2_ea_location *ef_el; }; @@ -160,14 +167,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, void *private) { struct ea_find *ef = private; - struct gfs2_ea_request *er = ef->ef_er; if (ea->ea_type == GFS2_EATYPE_UNUSED) return 0; - if (ea->ea_type == er->er_type) { - if (ea->ea_name_len == er->er_name_len && - !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) { + if (ea->ea_type == ef->type) { + if (ea->ea_name_len == ef->namel && + !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) { struct gfs2_ea_location *el = ef->ef_el; get_bh(bh); el->el_bh = bh; @@ -180,13 +186,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, return 0; } -int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er, +int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, struct gfs2_ea_location *el) { struct ea_find ef; int error; - ef.ef_er = er; + ef.type = type; + ef.name = name; + ef.namel = strlen(name); ef.ef_el = el; memset(el, 0, sizeof(struct gfs2_ea_location)); @@ -344,6 +352,20 @@ struct ea_list { unsigned int ei_size; }; +static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) +{ + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + return 5 + ea->ea_name_len + 1; + case GFS2_EATYPE_SYS: + return 7 + ea->ea_name_len + 1; + case GFS2_EATYPE_SECURITY: + return 9 + ea->ea_name_len + 1; + default: + return 0; + } +} + static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private) @@ -392,21 +414,25 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, } /** - * gfs2_ea_list - - * @ip: - * @er: + * gfs2_listxattr - List gfs2 extended attributes + * @dentry: The dentry whose inode we are interested in + * @buffer: The buffer to write the results + * @size: The size of the buffer * * Returns: actual size of data on success, -errno on error */ -int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) +ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) { + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_ea_request er; struct gfs2_holder i_gh; int error; - if (!er->er_data || !er->er_data_len) { - er->er_data = NULL; - er->er_data_len = 0; + memset(&er, 0, sizeof(struct gfs2_ea_request)); + if (size) { + er.er_data = buffer; + er.er_data_len = size; } error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); @@ -414,7 +440,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) return error; if (ip->i_eattr) { - struct ea_list ei = { .ei_er = er, .ei_size = 0 }; + struct ea_list ei = { .ei_er = &er, .ei_size = 0 }; error = ea_foreach(ip, ea_list_i, &ei); if (!error) @@ -491,84 +517,61 @@ out: } int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, - char *data) + char *data, size_t size) { + int ret; + size_t len = GFS2_EA_DATA_LEN(el->el_ea); + if (len > size) + return -ERANGE; + if (GFS2_EA_IS_STUFFED(el->el_ea)) { - memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea)); - return 0; - } else - return ea_get_unstuffed(ip, el->el_ea, data); + memcpy(data, GFS2_EA2DATA(el->el_ea), len); + return len; + } + ret = ea_get_unstuffed(ip, el->el_ea, data); + if (ret < 0) + return ret; + return len; } /** - * gfs2_ea_get_i - - * @ip: The GFS2 inode - * @er: The request structure + * gfs2_xattr_get - Get a GFS2 extended attribute + * @inode: The inode + * @type: The type of extended attribute + * @name: The name of the extended attribute + * @buffer: The buffer to write the result into + * @size: The size of the buffer * * Returns: actual size of data on success, -errno on error */ -int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +int gfs2_xattr_get(struct inode *inode, int type, const char *name, + void *buffer, size_t size) { + struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_ea_location el; int error; if (!ip->i_eattr) return -ENODATA; + if (strlen(name) > GFS2_EA_MAX_NAME_LEN) + return -EINVAL; - error = gfs2_ea_find(ip, er, &el); + error = gfs2_ea_find(ip, type, name, &el); if (error) return error; if (!el.el_ea) return -ENODATA; - - if (er->er_data_len) { - if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len) - error = -ERANGE; - else - error = gfs2_ea_get_copy(ip, &el, er->er_data); - } - if (!error) + if (size) + error = gfs2_ea_get_copy(ip, &el, buffer, size); + else error = GFS2_EA_DATA_LEN(el.el_ea); - brelse(el.el_bh); return error; } /** - * gfs2_ea_get - - * @ip: The GFS2 inode - * @er: The request structure - * - * Returns: actual size of data on success, -errno on error - */ - -int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct gfs2_holder i_gh; - int error; - - if (!er->er_name_len || - er->er_name_len > GFS2_EA_MAX_NAME_LEN) - return -EINVAL; - if (!er->er_data || !er->er_data_len) { - er->er_data = NULL; - er->er_data_len = 0; - } - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return error; - - error = gfs2_ea_ops[er->er_type]->eo_get(ip, er); - - gfs2_glock_dq_uninit(&i_gh); - - return error; -} - -/** * ea_alloc_blk - allocates a new block for extended attributes. * @ip: A pointer to the inode that's getting extended attributes * @bhp: Pointer to pointer to a struct buffer_head @@ -713,12 +716,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - if (er->er_flags & GFS2_ERF_MODE) { - gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), - (ip->i_inode.i_mode & S_IFMT) == - (er->er_mode & S_IFMT)); - ip->i_inode.i_mode = er->er_mode; - } ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -762,15 +759,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, * Returns: errno */ -static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er) +static int ea_init(struct gfs2_inode *ip, int type, const char *name, + const void *data, size_t size) { + struct gfs2_ea_request er; unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; unsigned int blks = 1; - if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize) - blks += DIV_ROUND_UP(er->er_data_len, jbsize); + er.er_type = type; + er.er_name = name; + er.er_name_len = strlen(name); + er.er_data = (void *)data; + er.er_data_len = size; + + if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize) + blks += DIV_ROUND_UP(er.er_data_len, jbsize); - return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL); + return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL); } static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) @@ -848,12 +853,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto out; - - if (er->er_flags & GFS2_ERF_MODE) { - gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), - (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT)); - ip->i_inode.i_mode = er->er_mode; - } ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -894,7 +893,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh, int stuffed; int error; - stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size); + stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len, + es->es_er->er_data_len, &size); if (ea->ea_type == GFS2_EATYPE_UNUSED) { if (GFS2_EA_REC_LEN(ea) < size) @@ -1005,15 +1005,22 @@ out: return error; } -static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, - struct gfs2_ea_location *el) +static int ea_set_i(struct gfs2_inode *ip, int type, const char *name, + const void *value, size_t size, struct gfs2_ea_location *el) { + struct gfs2_ea_request er; struct ea_set es; unsigned int blks = 2; int error; + er.er_type = type; + er.er_name = name; + er.er_data = (void *)value; + er.er_name_len = strlen(name); + er.er_data_len = size; + memset(&es, 0, sizeof(struct ea_set)); - es.es_er = er; + es.es_er = &er; es.es_el = el; error = ea_foreach(ip, ea_set_simple, &es); @@ -1024,10 +1031,10 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) blks++; - if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) - blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); + if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize) + blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); - return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); + return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el); } static int ea_set_remove_unstuffed(struct gfs2_inode *ip, @@ -1039,75 +1046,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip, GFS2_EA2NEXT(el->el_prev) == el->el_ea); } - return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0); -} - -int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct gfs2_ea_location el; - int error; - - if (!ip->i_eattr) { - if (er->er_flags & XATTR_REPLACE) - return -ENODATA; - return ea_init(ip, er); - } - - error = gfs2_ea_find(ip, er, &el); - if (error) - return error; - - if (el.el_ea) { - if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { - brelse(el.el_bh); - return -EPERM; - } - - error = -EEXIST; - if (!(er->er_flags & XATTR_CREATE)) { - int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); - error = ea_set_i(ip, er, &el); - if (!error && unstuffed) - ea_set_remove_unstuffed(ip, &el); - } - - brelse(el.el_bh); - } else { - error = -ENODATA; - if (!(er->er_flags & XATTR_REPLACE)) - error = ea_set_i(ip, er, NULL); - } - - return error; -} - -int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct gfs2_holder i_gh; - int error; - - if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) - return -EINVAL; - if (!er->er_data || !er->er_data_len) { - er->er_data = NULL; - er->er_data_len = 0; - } - error = ea_check_size(GFS2_SB(&ip->i_inode), er); - if (error) - return error; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); - if (error) - return error; - - if (IS_IMMUTABLE(&ip->i_inode)) - error = -EPERM; - else - error = gfs2_ea_ops[er->er_type]->eo_set(ip, er); - - gfs2_glock_dq_uninit(&i_gh); - - return error; + return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0); } static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) @@ -1131,8 +1070,9 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) if (GFS2_EA_IS_LAST(ea)) prev->ea_flags |= GFS2_EAFLAG_LAST; - } else + } else { ea->ea_type = GFS2_EATYPE_UNUSED; + } error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { @@ -1147,15 +1087,29 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) return error; } -int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +/** + * gfs2_xattr_remove - Remove a GFS2 extended attribute + * @inode: The inode + * @type: The type of the extended attribute + * @name: The name of the extended attribute + * + * This is not called directly by the VFS since we use the (common) + * scheme of making a "set with NULL data" mean a remove request. Note + * that this is different from a set with zero length data. + * + * Returns: 0, or errno on failure + */ + +static int gfs2_xattr_remove(struct inode *inode, int type, const char *name) { + struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_ea_location el; int error; if (!ip->i_eattr) return -ENODATA; - error = gfs2_ea_find(ip, er, &el); + error = gfs2_ea_find(ip, type, name, &el); if (error) return error; if (!el.el_ea) @@ -1164,8 +1118,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) if (GFS2_EA_IS_STUFFED(el.el_ea)) error = ea_remove_stuffed(ip, &el); else - error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, - 0); + error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0); brelse(el.el_bh); @@ -1173,31 +1126,70 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) } /** - * gfs2_ea_remove - sets (or creates or replaces) an extended attribute - * @ip: pointer to the inode of the target file - * @er: request information + * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute + * @inode: The inode + * @type: The type of the extended attribute + * @name: The name of the extended attribute + * @value: The value of the extended attribute (NULL for remove) + * @size: The size of the @value argument + * @flags: Create or Replace * - * Returns: errno + * See gfs2_xattr_remove() for details of the removal of xattrs. + * + * Returns: 0 or errno on failure */ -int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) +int gfs2_xattr_set(struct inode *inode, int type, const char *name, + const void *value, size_t size, int flags) { - struct gfs2_holder i_gh; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_ea_location el; + unsigned int namel = strlen(name); int error; - if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) - return -EINVAL; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + if (namel > GFS2_EA_MAX_NAME_LEN) + return -ERANGE; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (value == NULL) + return gfs2_xattr_remove(inode, type, name); + + if (ea_check_size(sdp, namel, size)) + return -ERANGE; + + if (!ip->i_eattr) { + if (flags & XATTR_REPLACE) + return -ENODATA; + return ea_init(ip, type, name, value, size); + } + + error = gfs2_ea_find(ip, type, name, &el); if (error) return error; - if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) - error = -EPERM; - else - error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er); + if (el.el_ea) { + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { + brelse(el.el_bh); + return -EPERM; + } - gfs2_glock_dq_uninit(&i_gh); + error = -EEXIST; + if (!(flags & XATTR_CREATE)) { + int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); + error = ea_set_i(ip, type, name, value, size, &el); + if (!error && unstuffed) + ea_set_remove_unstuffed(ip, &el); + } + + brelse(el.el_bh); + return error; + } + + error = -ENODATA; + if (!(flags & XATTR_REPLACE)) + error = ea_set_i(ip, type, name, value, size, NULL); return error; } @@ -1503,3 +1495,64 @@ out_alloc: return error; } +static int gfs2_xattr_user_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size); +} + +static int gfs2_xattr_user_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags); +} + +static int gfs2_xattr_system_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size); +} + +static int gfs2_xattr_system_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags); +} + +static int gfs2_xattr_security_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size); +} + +static int gfs2_xattr_security_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags); +} + +static struct xattr_handler gfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = gfs2_xattr_user_get, + .set = gfs2_xattr_user_set, +}; + +static struct xattr_handler gfs2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = gfs2_xattr_security_get, + .set = gfs2_xattr_security_set, +}; + +static struct xattr_handler gfs2_xattr_system_handler = { + .prefix = XATTR_SYSTEM_PREFIX, + .get = gfs2_xattr_system_get, + .set = gfs2_xattr_system_set, +}; + +struct xattr_handler *gfs2_xattr_handlers[] = { + &gfs2_xattr_user_handler, + &gfs2_xattr_security_handler, + &gfs2_xattr_system_handler, + NULL, +}; + diff --git a/fs/gfs2/eattr.h b/fs/gfs2/xattr.h index c82dbe01d713..cbdfd7743733 100644 --- a/fs/gfs2/eattr.h +++ b/fs/gfs2/xattr.h @@ -19,7 +19,7 @@ struct iattr; #define GFS2_EA_SIZE(ea) \ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \ - (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) + (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) #define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) #define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) @@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ #define GFS2_EAREQ_SIZE_STUFFED(er) \ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) -#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \ -ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \ - sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8) - #define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) #define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) @@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \ #define GFS2_EA_BH2FIRST(bh) \ ((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) -#define GFS2_ERF_MODE 0x80000000 - struct gfs2_ea_request { const char *er_name; char *er_data; unsigned int er_name_len; unsigned int er_data_len; unsigned int er_type; /* GFS2_EATYPE_... */ - int er_flags; - mode_t er_mode; }; struct gfs2_ea_location { @@ -61,40 +53,20 @@ struct gfs2_ea_location { struct gfs2_ea_header *el_prev; }; -int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); - -int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er); - -int gfs2_ea_dealloc(struct gfs2_inode *ip); +extern int gfs2_xattr_get(struct inode *inode, int type, const char *name, + void *buffer, size_t size); +extern int gfs2_xattr_set(struct inode *inode, int type, const char *name, + const void *value, size_t size, int flags); +extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); +extern int gfs2_ea_dealloc(struct gfs2_inode *ip); /* Exported to acl.c */ -int gfs2_ea_find(struct gfs2_inode *ip, - struct gfs2_ea_request *er, - struct gfs2_ea_location *el); -int gfs2_ea_get_copy(struct gfs2_inode *ip, - struct gfs2_ea_location *el, - char *data); -int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, - struct iattr *attr, char *data); - -static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) -{ - switch (ea->ea_type) { - case GFS2_EATYPE_USR: - return 5 + ea->ea_name_len + 1; - case GFS2_EATYPE_SYS: - return 7 + ea->ea_name_len + 1; - case GFS2_EATYPE_SECURITY: - return 9 + ea->ea_name_len + 1; - default: - return 0; - } -} +extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, + struct gfs2_ea_location *el); +extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, + char *data, size_t size); +extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, + struct iattr *attr, char *data); #endif /* __EATTR_DOT_H__ */ diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 6f833dc8e910..f7fcbe49da72 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -19,6 +19,7 @@ #include <linux/nls.h> #include <linux/parser.h> #include <linux/seq_file.h> +#include <linux/smp_lock.h> #include <linux/vfs.h> #include "hfs_fs.h" diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 9fc3af0c0dab..c0759fe0855b 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -12,6 +12,7 @@ #include <linux/pagemap.h> #include <linux/fs.h> #include <linux/slab.h> +#include <linux/smp_lock.h> #include <linux/vfs.h> #include <linux/nls.h> diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 6916c41d7017..8865c94f55f6 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -6,6 +6,7 @@ * directory VFS functions */ +#include <linux/smp_lock.h> #include "hpfs_fn.h" static int hpfs_dir_release(struct inode *inode, struct file *filp) diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 64ab52259204..3efabff00367 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -6,6 +6,7 @@ * file VFS functions */ +#include <linux/smp_lock.h> #include "hpfs_fn.h" #define BLOCKS(size) (((size) + 511) >> 9) diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index c2ea31bae313..701ca54c0867 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -13,7 +13,6 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include "hpfs.h" diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 39a1bfbea312..fe703ae46bc7 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -6,6 +6,7 @@ * inode VFS functions */ +#include <linux/smp_lock.h> #include "hpfs_fn.h" void hpfs_init_inode(struct inode *i) diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index b649232dde97..82b9c4ba9ed0 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -6,6 +6,7 @@ * adding & removing files & directories */ #include <linux/sched.h> +#include <linux/smp_lock.h> #include "hpfs_fn.h" static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 941c8425c10b..a93b885311d8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; static struct backing_dev_info hugetlbfs_backing_dev_info = { + .name = "hugetlbfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; @@ -935,26 +936,28 @@ static int can_do_hugetlb_shm(void) return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); } -struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) +struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, + struct user_struct **user) { int error = -ENOMEM; - int unlock_shm = 0; struct file *file; struct inode *inode; struct dentry *dentry, *root; struct qstr quick_string; - struct user_struct *user = current_user(); + *user = NULL; if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); if (!can_do_hugetlb_shm()) { - if (user_shm_lock(size, user)) { - unlock_shm = 1; + *user = current_user(); + if (user_shm_lock(size, *user)) { WARN_ONCE(1, "Using mlock ulimits for SHM_HUGETLB deprecated\n"); - } else + } else { + *user = NULL; return ERR_PTR(-EPERM); + } } root = hugetlbfs_vfsmount->mnt_root; @@ -996,8 +999,10 @@ out_inode: out_dentry: dput(dentry); out_shm_unlock: - if (unlock_shm) - user_shm_unlock(size, user); + if (*user) { + user_shm_unlock(size, *user); + *user = NULL; + } return ERR_PTR(error); } diff --git a/fs/inode.c b/fs/inode.c index 901bad1e5f12..b2ba83d2c4e1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -120,12 +120,11 @@ static void wake_up_inode(struct inode *inode) * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. */ -struct inode *inode_init_always(struct super_block *sb, struct inode *inode) +int inode_init_always(struct super_block *sb, struct inode *inode) { static const struct address_space_operations empty_aops; static struct inode_operations empty_iops; static const struct file_operations empty_fops; - struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; @@ -152,7 +151,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->dirtied_when = 0; if (security_inode_alloc(inode)) - goto out_free_inode; + goto out; /* allocate and initialize an i_integrity */ if (ima_inode_alloc(inode)) @@ -183,9 +182,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) if (sb->s_bdev) { struct backing_dev_info *bdi; - bdi = sb->s_bdev->bd_inode_backing_dev_info; - if (!bdi) - bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; mapping->backing_dev_info = bdi; } inode->i_private = NULL; @@ -198,16 +195,12 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif - return inode; + return 0; out_free_security: security_inode_free(inode); -out_free_inode: - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - kmem_cache_free(inode_cachep, (inode)); - return NULL; +out: + return -ENOMEM; } EXPORT_SYMBOL(inode_init_always); @@ -220,12 +213,21 @@ static struct inode *alloc_inode(struct super_block *sb) else inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); - if (inode) - return inode_init_always(sb, inode); - return NULL; + if (!inode) + return NULL; + + if (unlikely(inode_init_always(sb, inode))) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, inode); + return NULL; + } + + return inode; } -void destroy_inode(struct inode *inode) +void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); ima_inode_free(inode); @@ -237,13 +239,17 @@ void destroy_inode(struct inode *inode) if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) posix_acl_release(inode->i_default_acl); #endif +} +EXPORT_SYMBOL(__destroy_inode); + +void destroy_inode(struct inode *inode) +{ + __destroy_inode(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); else kmem_cache_free(inode_cachep, (inode)); } -EXPORT_SYMBOL(destroy_inode); - /* * These are initializations that only need to be done diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 58a7963e168a..85f96bc651c7 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -142,6 +142,7 @@ static const struct dentry_operations isofs_dentry_ops[] = { struct iso9660_options{ unsigned int rock:1; + unsigned int joliet:1; unsigned int cruft:1; unsigned int hide:1; unsigned int showassoc:1; @@ -151,7 +152,6 @@ struct iso9660_options{ unsigned int gid_set:1; unsigned int utf8:1; unsigned char map; - char joliet; unsigned char check; unsigned int blocksize; mode_t fmode; @@ -632,7 +632,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) else if (isonum_711(vdp->type) == ISO_VD_SUPPLEMENTARY) { sec = (struct iso_supplementary_descriptor *)vdp; if (sec->escape[0] == 0x25 && sec->escape[1] == 0x2f) { - if (opt.joliet == 'y') { + if (opt.joliet) { if (sec->escape[2] == 0x40) joliet_level = 1; else if (sec->escape[2] == 0x43) diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 61f32f3868cd..b0435dd0654d 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal) { transaction_t * transaction; tid_t first_tid; - unsigned long blocknr, freed; + unsigned int blocknr, freed; if (is_journal_aborted(journal)) return 1; @@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal) freed = freed + journal->j_last - journal->j_first; jbd_debug(1, - "Cleaning journal tail from %d to %d (offset %lu), " - "freeing %lu\n", + "Cleaning journal tail from %d to %d (offset %u), " + "freeing %u\n", journal->j_tail_sequence, first_tid, blocknr, freed); journal->j_free += freed; diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 618e21c0b7a3..4bd882548c45 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal) int bufs; int flags; int err; - unsigned long blocknr; + unsigned int blocknr; ktime_t start_time; u64 commit_time; char *tagp = NULL; diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 737f7246a4b5..bd3c073b485d 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -276,7 +276,7 @@ static void journal_kill_thread(journal_t *journal) int journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct journal_head **jh_out, - unsigned long blocknr) + unsigned int blocknr) { int need_copy_out = 0; int done_copy_out = 0; @@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; /* * The buffer really shouldn't be locked: only the current committing @@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction, J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + /* keep subsequent assertions sane */ + new_bh->b_state = 0; + init_buffer(new_bh, NULL, NULL); + atomic_set(&new_bh->b_count, 1); + new_jh = journal_add_journal_head(new_bh); /* This sleeps */ /* * If a new transaction has already done a buffer copy-out, then @@ -361,14 +367,6 @@ repeat: kunmap_atomic(mapped_data, KM_USER0); } - /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); - atomic_set(&new_bh->b_count, 1); - jbd_unlock_bh_state(bh_in); - - new_jh = journal_add_journal_head(new_bh); /* This sleeps */ - set_bh_page(new_bh, new_page, new_offset); new_jh->b_transaction = NULL; new_bh->b_size = jh2bh(jh_in)->b_size; @@ -385,7 +383,11 @@ repeat: * copying is moved to the transaction's shadow queue. */ JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); - journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh_in); + JBUFFER_TRACE(new_jh, "file as BJ_IO"); journal_file_buffer(new_jh, transaction, BJ_IO); @@ -565,9 +567,9 @@ int log_wait_commit(journal_t *journal, tid_t tid) * Log buffer allocation routines: */ -int journal_next_log_block(journal_t *journal, unsigned long *retp) +int journal_next_log_block(journal_t *journal, unsigned int *retp) { - unsigned long blocknr; + unsigned int blocknr; spin_lock(&journal->j_state_lock); J_ASSERT(journal->j_free > 1); @@ -588,11 +590,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp) * this is a no-op. If needed, we can use j_blk_offset - everything is * ready. */ -int journal_bmap(journal_t *journal, unsigned long blocknr, - unsigned long *retp) +int journal_bmap(journal_t *journal, unsigned int blocknr, + unsigned int *retp) { int err = 0; - unsigned long ret; + unsigned int ret; if (journal->j_inode) { ret = bmap(journal->j_inode, blocknr); @@ -602,7 +604,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr, char b[BDEVNAME_SIZE]; printk(KERN_ALERT "%s: journal block not found " - "at offset %lu on %s\n", + "at offset %u on %s\n", __func__, blocknr, bdevname(journal->j_dev, b)); @@ -628,7 +630,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr, struct journal_head *journal_get_descriptor_buffer(journal_t *journal) { struct buffer_head *bh; - unsigned long blocknr; + unsigned int blocknr; int err; err = journal_next_log_block(journal, &blocknr); @@ -772,7 +774,7 @@ journal_t * journal_init_inode (struct inode *inode) journal_t *journal = journal_init_common(); int err; int n; - unsigned long blocknr; + unsigned int blocknr; if (!journal) return NULL; @@ -844,10 +846,16 @@ static void journal_fail_superblock (journal_t *journal) static int journal_reset(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; - unsigned long first, last; + unsigned int first, last; first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); + if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } journal->j_first = first; journal->j_last = last; @@ -877,7 +885,7 @@ static int journal_reset(journal_t *journal) **/ int journal_create(journal_t *journal) { - unsigned long blocknr; + unsigned int blocknr; struct buffer_head *bh; journal_superblock_t *sb; int i, err; @@ -961,14 +969,14 @@ void journal_update_superblock(journal_t *journal, int wait) if (sb->s_start == 0 && journal->j_tail_sequence == journal->j_transaction_sequence) { jbd_debug(1,"JBD: Skipping superblock update on recovered sb " - "(start %ld, seq %d, errno %d)\n", + "(start %u, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); goto out; } spin_lock(&journal->j_state_lock); - jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", + jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); @@ -1363,7 +1371,7 @@ int journal_flush(journal_t *journal) { int err = 0; transaction_t *transaction = NULL; - unsigned long old_tail; + unsigned int old_tail; spin_lock(&journal->j_state_lock); diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index db5e982c5ddf..cb1a49ae605e 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start) { int err; unsigned int max, nbufs, next; - unsigned long blocknr; + unsigned int blocknr; struct buffer_head *bh; struct buffer_head * bufs[MAXBUF]; @@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, unsigned int offset) { int err; - unsigned long blocknr; + unsigned int blocknr; struct buffer_head *bh; *bhp = NULL; @@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { unsigned int first_commit_ID, next_commit_ID; - unsigned long next_log_block; + unsigned int next_log_block; int err, success = 0; journal_superblock_t * sb; journal_header_t * tmp; @@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal, if (tid_geq(next_commit_ID, info->end_transaction)) break; - jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", + jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n", next_commit_ID, next_log_block, journal->j_last); /* Skip over each chunk of the transaction looking * either the next descriptor block or the final commit * record. */ - jbd_debug(3, "JBD: checking block %ld\n", next_log_block); + jbd_debug(3, "JBD: checking block %u\n", next_log_block); err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal, tagp = &bh->b_data[sizeof(journal_header_t)]; while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) <= journal->j_blocksize) { - unsigned long io_block; + unsigned int io_block; tag = (journal_block_tag_t *) tagp; flags = be32_to_cpu(tag->t_flags); @@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal, success = err; printk (KERN_ERR "JBD: IO error %d recovering " - "block %ld in log\n", + "block %u in log\n", err, io_block); } else { - unsigned long blocknr; + unsigned int blocknr; J_ASSERT(obh != NULL); blocknr = be32_to_cpu(tag->t_blocknr); @@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, max = be32_to_cpu(header->r_count); while (offset < max) { - unsigned long blocknr; + unsigned int blocknr; int err; blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index da6cd9bdaabc..ad717328343a 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -101,7 +101,7 @@ struct jbd_revoke_record_s { struct list_head hash; tid_t sequence; /* Used for recovery only */ - unsigned long blocknr; + unsigned int blocknr; }; @@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int); /* Utility functions to maintain the revoke table */ /* Borrowed from buffer.c: this is a tried and tested block hash function */ -static inline int hash(journal_t *journal, unsigned long block) +static inline int hash(journal_t *journal, unsigned int block) { struct jbd_revoke_table_s *table = journal->j_revoke; int hash_shift = table->hash_shift; @@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block) (block << (hash_shift - 12))) & (table->hash_size - 1); } -static int insert_revoke_hash(journal_t *journal, unsigned long blocknr, +static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, tid_t seq) { struct list_head *hash_list; @@ -166,7 +166,7 @@ oom: /* Find a revoke record in the journal's hash table. */ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, - unsigned long blocknr) + unsigned int blocknr) { struct list_head *hash_list; struct jbd_revoke_record_s *record; @@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal) * by one. */ -int journal_revoke(handle_t *handle, unsigned long blocknr, +int journal_revoke(handle_t *handle, unsigned int blocknr, struct buffer_head *bh_in) { struct buffer_head *bh = NULL; @@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr, } } - jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); + jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, handle->h_transaction->t_tid); BUFFER_TRACE(bh_in, "exit"); @@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal, */ int journal_set_revoke(journal_t *journal, - unsigned long blocknr, + unsigned int blocknr, tid_t sequence) { struct jbd_revoke_record_s *record; @@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal, */ int journal_test_revoke(journal_t *journal, - unsigned long blocknr, + unsigned int blocknr, tid_t sequence) { struct jbd_revoke_record_s *record; diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 73242ba7c7b1..006f9ad838a2 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction) spin_lock_init(&transaction->t_handle_lock); /* Set up the commit timer for the new transaction. */ - journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); + journal->j_commit_timer.expires = + round_jiffies_up(transaction->t_expires); add_timer(&journal->j_commit_timer); J_ASSERT(journal->j_running_transaction == NULL); @@ -228,6 +229,8 @@ repeat_locked: __log_space_left(journal)); spin_unlock(&transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); + + lock_map_acquire(&handle->h_lockdep_map); out: if (unlikely(new_transaction)) /* It's usually NULL */ kfree(new_transaction); @@ -292,9 +295,6 @@ handle_t *journal_start(journal_t *journal, int nblocks) handle = ERR_PTR(err); goto out; } - - lock_map_acquire(&handle->h_lockdep_map); - out: return handle; } @@ -416,6 +416,7 @@ int journal_restart(handle_t *handle, int nblocks) __log_start_commit(journal, transaction->t_tid); spin_unlock(&journal->j_state_lock); + lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; ret = start_this_handle(journal, handle); return ret; @@ -489,34 +490,15 @@ void journal_unlock_updates (journal_t *journal) wake_up(&journal->j_wait_transaction_locked); } -/* - * Report any unexpected dirty buffers which turn up. Normally those - * indicate an error, but they can occur if the user is running (say) - * tune2fs to modify the live filesystem, so we need the option of - * continuing as gracefully as possible. # - * - * The caller should already hold the journal lock and - * j_list_lock spinlock: most callers will need those anyway - * in order to probe the buffer's journaling state safely. - */ -static void jbd_unexpected_dirty_buffer(struct journal_head *jh) +static void warn_dirty_buffer(struct buffer_head *bh) { - int jlist; - - /* If this buffer is one which might reasonably be dirty - * --- ie. data, or not part of this journal --- then - * we're OK to leave it alone, but otherwise we need to - * move the dirty bit to the journal's own internal - * JBDDirty bit. */ - jlist = jh->b_jlist; - - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - struct buffer_head *bh = jh2bh(jh); + char b[BDEVNAME_SIZE]; - if (test_clear_buffer_dirty(bh)) - set_buffer_jbddirty(bh); - } + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } /* @@ -583,14 +565,16 @@ repeat: if (jh->b_next_transaction) J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + warn_dirty_buffer(bh); } /* * In any case we need to clean the dirty flag and we must * do it under the buffer lock to be sure we don't race * with running write-out. */ - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); - jbd_unexpected_dirty_buffer(jh); + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); } unlock_buffer(bh); @@ -826,6 +810,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); if (jh->b_transaction == NULL) { + /* + * Previous journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); jh->b_transaction = transaction; /* first access by this transaction */ @@ -1782,8 +1775,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); __journal_file_buffer(jh, transaction, BJ_Forget); - clear_buffer_jbddirty(bh); may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); @@ -2041,12 +2039,17 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction && jh->b_jlist == jlist) return; - /* The following list of buffer states needs to be consistent - * with __jbd_unexpected_dirty_buffer()'s handling of dirty - * state. */ - if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. + */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) was_dirty = 1; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7b4088b2364d..0df600e9162d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -220,7 +220,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping) .nr_to_write = mapping->nrpages * 2, .range_start = 0, .range_end = i_size_read(mapping->host), - .for_writepages = 1, }; ret = generic_writepages(mapping, &wbc); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 18bfd5dab642..e378cb383979 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -297,6 +297,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); struct jbd2_buffer_trigger_type *triggers; + journal_t *journal = transaction->t_journal; /* * The buffer really shouldn't be locked: only the current committing @@ -310,6 +311,11 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + /* keep subsequent assertions sane */ + new_bh->b_state = 0; + init_buffer(new_bh, NULL, NULL); + atomic_set(&new_bh->b_count, 1); + new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ /* * If a new transaction has already done a buffer copy-out, then @@ -388,14 +394,6 @@ repeat: kunmap_atomic(mapped_data, KM_USER0); } - /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); - atomic_set(&new_bh->b_count, 1); - jbd_unlock_bh_state(bh_in); - - new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ - set_bh_page(new_bh, new_page, new_offset); new_jh->b_transaction = NULL; new_bh->b_size = jh2bh(jh_in)->b_size; @@ -412,7 +410,11 @@ repeat: * copying is moved to the transaction's shadow queue. */ JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); - jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh_in); + JBUFFER_TRACE(new_jh, "file as BJ_IO"); jbd2_journal_file_buffer(new_jh, transaction, BJ_IO); @@ -2410,6 +2412,7 @@ const char *jbd2_dev_to_name(dev_t device) int i = hash_32(device, CACHE_SIZE_BITS); char *ret; struct block_device *bd; + static struct devname_cache *new_dev; rcu_read_lock(); if (devcache[i] && devcache[i]->device == device) { @@ -2419,20 +2422,20 @@ const char *jbd2_dev_to_name(dev_t device) } rcu_read_unlock(); + new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); + if (!new_dev) + return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ spin_lock(&devname_cache_lock); if (devcache[i]) { if (devcache[i]->device == device) { + kfree(new_dev); ret = devcache[i]->devname; spin_unlock(&devname_cache_lock); return ret; } call_rcu(&devcache[i]->rcu, free_devcache); } - devcache[i] = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); - if (!devcache[i]) { - spin_unlock(&devname_cache_lock); - return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ - } + devcache[i] = new_dev; devcache[i]->device = device; bd = bdget(device); if (bd) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 494501edba6b..6213ac728f30 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -499,34 +499,15 @@ void jbd2_journal_unlock_updates (journal_t *journal) wake_up(&journal->j_wait_transaction_locked); } -/* - * Report any unexpected dirty buffers which turn up. Normally those - * indicate an error, but they can occur if the user is running (say) - * tune2fs to modify the live filesystem, so we need the option of - * continuing as gracefully as possible. # - * - * The caller should already hold the journal lock and - * j_list_lock spinlock: most callers will need those anyway - * in order to probe the buffer's journaling state safely. - */ -static void jbd_unexpected_dirty_buffer(struct journal_head *jh) +static void warn_dirty_buffer(struct buffer_head *bh) { - int jlist; - - /* If this buffer is one which might reasonably be dirty - * --- ie. data, or not part of this journal --- then - * we're OK to leave it alone, but otherwise we need to - * move the dirty bit to the journal's own internal - * JBDDirty bit. */ - jlist = jh->b_jlist; + char b[BDEVNAME_SIZE]; - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - struct buffer_head *bh = jh2bh(jh); - - if (test_clear_buffer_dirty(bh)) - set_buffer_jbddirty(bh); - } + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } /* @@ -593,14 +574,16 @@ repeat: if (jh->b_next_transaction) J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + warn_dirty_buffer(bh); } /* * In any case we need to clean the dirty flag and we must * do it under the buffer lock to be sure we don't race * with running write-out. */ - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); - jbd_unexpected_dirty_buffer(jh); + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); } unlock_buffer(bh); @@ -843,6 +826,15 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); if (jh->b_transaction == NULL) { + /* + * Previous jbd2_journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); jh->b_transaction = transaction; /* first access by this transaction */ @@ -1644,8 +1636,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); - clear_buffer_jbddirty(bh); may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); @@ -1896,12 +1893,17 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, if (jh->b_transaction && jh->b_jlist == jlist) return; - /* The following list of buffer states needs to be consistent - * with __jbd_unexpected_dirty_buffer()'s handling of dirty - * state. */ - if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. + */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) was_dirty = 1; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 8fcb6239218e..7edb62e97419 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return rc; } -static int jffs2_check_acl(struct inode *inode, int mask) +int jffs2_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; int rc; @@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jffs2_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, jffs2_check_acl); -} - int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) { struct posix_acl *acl, *clone; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index fc929f2a14f6..f0ba63e3c36b 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -26,7 +26,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL -extern int jffs2_permission(struct inode *, int); +extern int jffs2_check_acl(struct inode *, int); extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *); @@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler; #else -#define jffs2_permission (NULL) +#define jffs2_check_acl (NULL) #define jffs2_acl_chmod(inode) (0) #define jffs2_init_acl_pre(dir_i,inode,mode) (0) #define jffs2_init_acl_post(inode) (0) diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 6f60cc910f4c..7aa4417e085f 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations = .rmdir = jffs2_rmdir, .mknod = jffs2_mknod, .rename = jffs2_rename, - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index a0244740b75a..b47679be118a 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -270,19 +270,21 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, D2({ int i=0; struct jffs2_raw_node_ref *this; - printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n" KERN_DEBUG); + printk(KERN_DEBUG "After remove_node_refs_from_ino_list: \n"); this = ic->nodes; + printk(KERN_DEBUG); while(this) { - printk( "0x%08x(%d)->", ref_offset(this), ref_flags(this)); + printk(KERN_CONT "0x%08x(%d)->", + ref_offset(this), ref_flags(this)); if (++i == 5) { - printk("\n" KERN_DEBUG); + printk(KERN_DEBUG); i=0; } this = this->next_in_ino; } - printk("\n"); + printk(KERN_CONT "\n"); }); switch (ic->class) { diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 5edc2bf20581..b7b74e299142 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, @@ -99,7 +99,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) kunmap(pg); D2(printk(KERN_DEBUG "readpage finished\n")); - return 0; + return ret; } int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg) diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 07a22caf2687..0035c021395a 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/smp_lock.h> #include <linux/init.h> #include <linux/list.h> #include <linux/fs.h> diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index b7339c3b6ad9..4ec11e8bda8c 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index d9a721e6db70..5ef7bac265e5 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1268,10 +1268,20 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) { if (!c->wbuf) return -ENOMEM; +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf_verify) { + kfree(c->wbuf); + return -ENOMEM; + } +#endif return 0; } void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + kfree(c->wbuf_verify); +#endif kfree(c->wbuf); } diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 91fa3ad6e8c2..d66477c34306 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -67,10 +67,8 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type) acl = posix_acl_from_xattr(value, size); } kfree(value); - if (!IS_ERR(acl)) { + if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); - posix_acl_release(acl); - } return acl; } @@ -116,7 +114,7 @@ out: return rc; } -static int jfs_check_acl(struct inode *inode, int mask) +int jfs_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); @@ -131,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jfs_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, jfs_check_acl); -} - int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 7f6063acaa3b..2b70fa78e4a7 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = { .removexattr = jfs_removexattr, #ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, - .permission = jfs_permission, + .check_acl = jfs_check_acl, #endif }; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 88475f10a389..b07bd417ef85 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_permission(struct inode *, int); +int jfs_check_acl(struct inode *, int); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_setattr(struct dentry *, struct iattr *); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 514ee2edb92a..c79a4270f083 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = { .removexattr = jfs_removexattr, #ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, - .permission = jfs_permission, + .check_acl = jfs_check_acl, #endif }; diff --git a/fs/libfs.c b/fs/libfs.c index ddfa89948c3f..dcec3d3ea64f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -217,7 +217,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, return PTR_ERR(s); s->s_flags = MS_NOUSER; - s->s_maxbytes = ~0ULL; + s->s_maxbytes = MAX_LFS_FILESIZE; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = magic; diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index f2fdcbce143e..4336adba952a 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -7,6 +7,7 @@ */ #include <linux/module.h> +#include <linux/smp_lock.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/fs.h> diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 99d737bd4325..7cb076ac6b45 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap) return hash & (NLM_HOST_NRHASH - 1); } -static void nlm_clear_port(struct sockaddr *sap) -{ - switch (sap->sa_family) { - case AF_INET: - ((struct sockaddr_in *)sap)->sin_port = 0; - break; - case AF_INET6: - ((struct sockaddr_in6 *)sap)->sin6_port = 0; - break; - } -} - /* * Common host lookup routine for server & client */ @@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) host->h_addrbuf = nsm->sm_addrbuf; memcpy(nlm_addr(host), ni->sap, ni->salen); host->h_addrlen = ni->salen; - nlm_clear_port(nlm_addr(host)); + rpc_set_port(nlm_addr(host), 0); memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); host->h_version = ni->version; host->h_proto = ni->protocol; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 7fce1b525849..30c933188dd7 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) return (struct sockaddr *)&nsm->sm_addr; } -static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf, - const size_t len) -{ - const struct sockaddr_in *sin = (struct sockaddr_in *)sap; - snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr); -} - -static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf, - const size_t len) -{ - const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - - if (ipv6_addr_v4mapped(&sin6->sin6_addr)) - snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]); - else if (sin6->sin6_scope_id != 0) - snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr, - sin6->sin6_scope_id); - else - snprintf(buf, len, "%pI6", &sin6->sin6_addr); -} - -static void nsm_display_address(const struct sockaddr *sap, - char *buf, const size_t len) -{ - switch (sap->sa_family) { - case AF_INET: - nsm_display_ipv4_address(sap, buf, len); - break; - case AF_INET6: - nsm_display_ipv6_address(sap, buf, len); - break; - default: - snprintf(buf, len, "unsupported address family"); - break; - } -} - static struct rpc_clnt *nsm_create(void) { struct sockaddr_in sin = { @@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, memcpy(nsm_addr(new), sap, salen); new->sm_addrlen = salen; nsm_init_private(new); - nsm_display_address((const struct sockaddr *)&new->sm_addr, - new->sm_addrbuf, sizeof(new->sm_addrbuf)); + + if (rpc_ntop(nsm_addr(new), new->sm_addrbuf, + sizeof(new->sm_addrbuf)) == 0) + (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf), + "unsupported address family"); memcpy(new->sm_name, hostname, hostname_len); new->sm_name[hostname_len] = '\0'; diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 1725037374c5..bd173a6ca3b1 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -10,6 +10,7 @@ #include <linux/types.h> #include <linux/time.h> #include <linux/slab.h> +#include <linux/smp_lock.h> #include <linux/in.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/clnt.h> diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 3688e55901fc..e1d28ddd2169 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -10,6 +10,7 @@ #include <linux/types.h> #include <linux/time.h> #include <linux/slab.h> +#include <linux/smp_lock.h> #include <linux/in.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/clnt.h> diff --git a/fs/locks.c b/fs/locks.c index b6440f52178f..19ee18a6829b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) * give it the opportunity to lock the file. */ if (found) - cond_resched_bkl(); + cond_resched(); find_conflict: for_each_lock(inode, before) { @@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) if (can_sleep) lock->fl_flags |= FL_SLEEP; - error = security_file_lock(filp, cmd); + error = security_file_lock(filp, lock->fl_type); if (error) goto out_free; diff --git a/fs/namei.c b/fs/namei.c index f3c5b278895a..d11f404667e9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -169,19 +169,10 @@ void putname(const char *name) EXPORT_SYMBOL(putname); #endif - -/** - * generic_permission - check for access rights on a Posix-like filesystem - * @inode: inode to check access rights for - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) - * @check_acl: optional callback to check for Posix ACLs - * - * Used to check for read/write/execute permissions on a file. - * We use "fsuid" for this, letting us set arbitrary permissions - * for filesystem access without changing the "normal" uids which - * are used for other things.. +/* + * This does basic POSIX ACL permission checking */ -int generic_permission(struct inode *inode, int mask, +static int acl_permission_check(struct inode *inode, int mask, int (*check_acl)(struct inode *inode, int mask)) { umode_t mode = inode->i_mode; @@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask, else { if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { int error = check_acl(inode, mask); - if (error == -EACCES) - goto check_capabilities; - else if (error != -EAGAIN) + if (error != -EAGAIN) return error; } @@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask, */ if ((mask & ~mode) == 0) return 0; + return -EACCES; +} + +/** + * generic_permission - check for access rights on a Posix-like filesystem + * @inode: inode to check access rights for + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @check_acl: optional callback to check for Posix ACLs + * + * Used to check for read/write/execute permissions on a file. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things.. + */ +int generic_permission(struct inode *inode, int mask, + int (*check_acl)(struct inode *inode, int mask)) +{ + int ret; + + /* + * Do the basic POSIX ACL permission checks. + */ + ret = acl_permission_check(inode, mask, check_acl); + if (ret != -EACCES) + return ret; - check_capabilities: /* * Read/write DACs are always overridable. * Executable DACs are overridable if at least one exec bit is set. @@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask) if (inode->i_op->permission) retval = inode->i_op->permission(inode, mask); else - retval = generic_permission(inode, mask, NULL); + retval = generic_permission(inode, mask, inode->i_op->check_acl); if (retval) return retval; @@ -432,29 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, */ static int exec_permission_lite(struct inode *inode) { - umode_t mode = inode->i_mode; + int ret; - if (inode->i_op->permission) - return -EAGAIN; - - if (current_fsuid() == inode->i_uid) - mode >>= 6; - else if (in_group_p(inode->i_gid)) - mode >>= 3; - - if (mode & MAY_EXEC) - goto ok; - - if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE)) - goto ok; - - if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE)) + if (inode->i_op->permission) { + ret = inode->i_op->permission(inode, MAY_EXEC); + if (!ret) + goto ok; + return ret; + } + ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); + if (!ret) goto ok; - if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH)) + if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) goto ok; - return -EACCES; + return ret; ok: return security_inode_permission(inode, MAY_EXEC); } @@ -853,12 +859,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) nd->flags |= LOOKUP_CONTINUE; err = exec_permission_lite(inode); - if (err == -EAGAIN) - err = inode_permission(nd->path.dentry->d_inode, - MAY_EXEC); - if (!err) - err = ima_path_check(&nd->path, MAY_EXEC, - IMA_COUNT_UPDATE); if (err) break; @@ -1533,37 +1533,42 @@ int may_open(struct path *path, int acc_mode, int flag) if (error) return error; - error = ima_path_check(path, - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), + error = ima_path_check(path, acc_mode ? + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : + ACC_MODE(flag) & (MAY_READ | MAY_WRITE), IMA_COUNT_UPDATE); + if (error) return error; /* * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { + error = -EPERM; if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) - return -EPERM; + goto err_out; if (flag & O_TRUNC) - return -EPERM; + goto err_out; } /* O_NOATIME can only be set by the owner or superuser */ if (flag & O_NOATIME) - if (!is_owner_or_cap(inode)) - return -EPERM; + if (!is_owner_or_cap(inode)) { + error = -EPERM; + goto err_out; + } /* * Ensure there are no outstanding leases on the file. */ error = break_lease(inode, flag); if (error) - return error; + goto err_out; if (flag & O_TRUNC) { error = get_write_access(inode); if (error) - return error; + goto err_out; /* * Refuse to truncate files with mandatory locks held on them. @@ -1581,12 +1586,17 @@ int may_open(struct path *path, int acc_mode, int flag) } put_write_access(inode); if (error) - return error; + goto err_out; } else if (flag & FMODE_WRITE) vfs_dq_init(inode); return 0; +err_out: + ima_counts_put(path, acc_mode ? + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : + ACC_MODE(flag) & (MAY_READ | MAY_WRITE)); + return error; } /* diff --git a/fs/namespace.c b/fs/namespace.c index 3dc283fd4716..7230787d18b0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -22,6 +22,7 @@ #include <linux/seq_file.h> #include <linux/mnt_namespace.h> #include <linux/namei.h> +#include <linux/nsproxy.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/ramfs.h> @@ -315,7 +316,8 @@ EXPORT_SYMBOL_GPL(mnt_clone_write); */ int mnt_want_write_file(struct file *file) { - if (!(file->f_mode & FMODE_WRITE)) + struct inode *inode = file->f_dentry->d_inode; + if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) return mnt_want_write(file->f_path.mnt); else return mnt_clone_write(file->f_path.mnt); diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 845159814de2..da7fda639eac 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ direct.o pagelist.o proc.o read.o symlink.o unlink.o \ - write.o namespace.o mount_clnt.o + write.o namespace.o mount_clnt.o \ + dns_resolve.o cache_lib.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c new file mode 100644 index 000000000000..b4ffd0146ea6 --- /dev/null +++ b/fs/nfs/cache_lib.c @@ -0,0 +1,140 @@ +/* + * linux/fs/nfs/cache_lib.c + * + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/sunrpc/cache.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + +#include "cache_lib.h" + +#define NFS_CACHE_UPCALL_PATHLEN 256 +#define NFS_CACHE_UPCALL_TIMEOUT 15 + +static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] = + "/sbin/nfs_cache_getent"; +static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT; + +module_param_string(cache_getent, nfs_cache_getent_prog, + sizeof(nfs_cache_getent_prog), 0600); +MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program"); +module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600); +MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which " + "the cache upcall is assumed to have failed"); + +int nfs_cache_upcall(struct cache_detail *cd, char *entry_name) +{ + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[] = { + nfs_cache_getent_prog, + cd->name, + entry_name, + NULL + }; + int ret = -EACCES; + + if (nfs_cache_getent_prog[0] == '\0') + goto out; + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or + * EACCES error. The admin can re-enable it on the fly by using + * sysfs to set the 'cache_getent' parameter once the problem + * has been fixed. + */ + if (ret == -ENOENT || ret == -EACCES) + nfs_cache_getent_prog[0] = '\0'; +out: + return ret > 0 ? 0 : ret; +} + +/* + * Deferred request handling + */ +void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) +{ + if (atomic_dec_and_test(&dreq->count)) + kfree(dreq); +} + +static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); + + complete_all(&dreq->completion); + nfs_cache_defer_req_put(dreq); +} + +static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(req, struct nfs_cache_defer_req, req); + dreq->deferred_req.revisit = nfs_dns_cache_revisit; + atomic_inc(&dreq->count); + + return &dreq->deferred_req; +} + +struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) +{ + struct nfs_cache_defer_req *dreq; + + dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); + if (dreq) { + init_completion(&dreq->completion); + atomic_set(&dreq->count, 1); + dreq->req.defer = nfs_dns_cache_defer; + } + return dreq; +} + +int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq) +{ + if (wait_for_completion_timeout(&dreq->completion, + nfs_cache_getent_timeout * HZ) == 0) + return -ETIMEDOUT; + return 0; +} + +int nfs_cache_register(struct cache_detail *cd) +{ + struct nameidata nd; + struct vfsmount *mnt; + int ret; + + mnt = rpc_get_mount(); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd); + if (ret) + goto err; + ret = sunrpc_cache_register_pipefs(nd.path.dentry, + cd->name, 0600, cd); + path_put(&nd.path); + if (!ret) + return ret; +err: + rpc_put_mount(); + return ret; +} + +void nfs_cache_unregister(struct cache_detail *cd) +{ + sunrpc_cache_unregister_pipefs(cd); + rpc_put_mount(); +} + diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h new file mode 100644 index 000000000000..76f856e284e4 --- /dev/null +++ b/fs/nfs/cache_lib.h @@ -0,0 +1,27 @@ +/* + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + */ + +#include <linux/completion.h> +#include <linux/sunrpc/cache.h> +#include <asm/atomic.h> + +/* + * Deferred request handling + */ +struct nfs_cache_defer_req { + struct cache_req req; + struct cache_deferred_req deferred_req; + struct completion completion; + atomic_t count; +}; + +extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); +extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void); +extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); +extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); + +extern int nfs_cache_register(struct cache_detail *cd); +extern void nfs_cache_unregister(struct cache_detail *cd); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 7f604c7941fb..293fa0528a6e 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program; unsigned int nfs_callback_set_tcpport; unsigned short nfs_callback_tcpport; unsigned short nfs_callback_tcpport6; -static const int nfs_set_port_min = 0; -static const int nfs_set_port_max = 65535; +#define NFS_CALLBACK_MAXPORTNR (65535U) -static int param_set_port(const char *val, struct kernel_param *kp) +static int param_set_portnr(const char *val, struct kernel_param *kp) { - char *endp; - int num = simple_strtol(val, &endp, 0); - if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) + unsigned long num; + int ret; + + if (!val) + return -EINVAL; + ret = strict_strtoul(val, 0, &num); + if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) return -EINVAL; - *((int *)kp->arg) = num; + *((unsigned int *)kp->arg) = num; return 0; } -module_param_call(callback_tcpport, param_set_port, param_get_int, - &nfs_callback_set_tcpport, 0644); +static int param_get_portnr(char *buffer, struct kernel_param *kp) +{ + return param_get_uint(buffer, kp); +} +#define param_check_portnr(name, p) __param_check(name, p, unsigned int); + +module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); /* * This is the NFSv4 callback kernel thread. diff --git a/fs/nfs/client.c b/fs/nfs/client.c index c2d061675d80..e350bd6a2334 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -809,6 +809,9 @@ static int nfs_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; server->options = data->options; + server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -879,6 +882,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + server->backing_dev_info.name = "nfs"; server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; if (server->wsize > max_rpc_payload) @@ -1074,10 +1078,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - spin_lock(&nfs_client_lock); list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); list_add_tail(&server->master_link, &nfs_volume_list); @@ -1242,20 +1242,6 @@ error: return error; } -/* - * Initialize a session. - * Note: save the mount rsize and wsize for create_server negotiation. - */ -static void nfs4_init_session(struct nfs_client *clp, - unsigned int wsize, unsigned int rsize) -{ -#if defined(CONFIG_NFS_V4_1) - if (nfs4_has_session(clp)) { - clp->cl_session->fc_attrs.max_rqst_sz = wsize; - clp->cl_session->fc_attrs.max_resp_sz = rsize; - } -#endif /* CONFIG_NFS_V4_1 */ -} /* * Session has been established, and the client marked ready. @@ -1288,7 +1274,7 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; - server->caps |= NFS_CAP_ATOMIC_OPEN; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; server->options = data->options; /* Get a client record */ @@ -1350,7 +1336,9 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, BUG_ON(!server->nfs_client->rpc_ops); BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - nfs4_init_session(server->nfs_client, server->wsize, server->rsize); + error = nfs4_init_session(server); + if (error < 0) + goto error; /* Probe the root fh to retrieve its FSID */ error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); @@ -1371,10 +1359,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - spin_lock(&nfs_client_lock); list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); list_add_tail(&server->master_link, &nfs_volume_list); @@ -1412,7 +1396,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, /* Initialise the client representation from the parent server */ nfs_server_copy_userdata(server, parent_server); - server->caps |= NFS_CAP_ATOMIC_OPEN; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; /* Get a client representation. * Note: NFSv4 always uses TCP, */ diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index af05b918cb5b..6dd48a4405b4 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -10,6 +10,7 @@ #include <linux/kthread.h> #include <linux/module.h> #include <linux/sched.h> +#include <linux/smp_lock.h> #include <linux/spinlock.h> #include <linux/nfs4.h> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 89f98e9a024b..32062c33c859 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -29,7 +29,6 @@ #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include <linux/pagevec.h> #include <linux/namei.h> #include <linux/mount.h> @@ -1026,12 +1025,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ - case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; + /* case -EISDIR: */ /* case -EINVAL: */ default: goto out; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 489fc01a3204..6c3210099d51 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -255,7 +255,7 @@ static void nfs_direct_read_release(void *calldata) if (put_dreq(dreq)) nfs_direct_complete(dreq); - nfs_readdata_release(calldata); + nfs_readdata_free(data); } static const struct rpc_call_ops nfs_read_direct_ops = { @@ -314,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->npages, 1, 0, data->pagevec, NULL); up_read(¤t->mm->mmap_sem); if (result < 0) { - nfs_readdata_release(data); + nfs_readdata_free(data); break; } if ((unsigned)result < data->npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { nfs_direct_release_pages(data->pagevec, result); - nfs_readdata_release(data); + nfs_readdata_free(data); break; } bytes -= pgbase; @@ -334,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = get_nfs_open_context(ctx); + data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -441,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); list_del(&data->pages); nfs_direct_release_pages(data->pagevec, data->npages); - nfs_writedata_release(data); + nfs_writedata_free(data); } } @@ -534,7 +534,7 @@ static void nfs_direct_commit_release(void *calldata) dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); nfs_direct_write_complete(dreq, data->inode); - nfs_commitdata_release(calldata); + nfs_commit_free(data); } static const struct rpc_call_ops nfs_commit_direct_ops = { @@ -570,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->args.fh = NFS_FH(data->inode); data->args.offset = 0; data->args.count = 0; - data->args.context = get_nfs_open_context(dreq->ctx); + data->args.context = dreq->ctx; data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -734,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->npages, 0, 0, data->pagevec, NULL); up_read(¤t->mm->mmap_sem); if (result < 0) { - nfs_writedata_release(data); + nfs_writedata_free(data); break; } if ((unsigned)result < data->npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { nfs_direct_release_pages(data->pagevec, result); - nfs_writedata_release(data); + nfs_writedata_free(data); break; } bytes -= pgbase; @@ -756,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = get_nfs_open_context(ctx); + data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -934,9 +934,6 @@ out: * back into its cache. We let the server do generic write * parameter checking and report problems. * - * We also avoid an unnecessary invocation of generic_osync_inode(), - * as it is fairly meaningless to sync the metadata of an NFS file. - * * We eliminate local atime updates, see direct read above. * * We avoid unnecessary page cache invalidations for normal cached diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c new file mode 100644 index 000000000000..f4d54ba97cc6 --- /dev/null +++ b/fs/nfs/dns_resolve.c @@ -0,0 +1,335 @@ +/* + * linux/fs/nfs/dns_resolve.c + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + * + * Resolves DNS hostnames into valid ip addresses + */ + +#include <linux/hash.h> +#include <linux/string.h> +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/seq_file.h> +#include <linux/inet.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/cache.h> +#include <linux/sunrpc/svcauth.h> + +#include "dns_resolve.h" +#include "cache_lib.h" + +#define NFS_DNS_HASHBITS 4 +#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) + +static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE]; + +struct nfs_dns_ent { + struct cache_head h; + + char *hostname; + size_t namelen; + + struct sockaddr_storage addr; + size_t addrlen; +}; + + +static void nfs_dns_ent_init(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + kfree(new->hostname); + new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); + if (new->hostname) { + new->namelen = key->namelen; + memcpy(&new->addr, &key->addr, key->addrlen); + new->addrlen = key->addrlen; + } else { + new->namelen = 0; + new->addrlen = 0; + } +} + +static void nfs_dns_ent_put(struct kref *ref) +{ + struct nfs_dns_ent *item; + + item = container_of(ref, struct nfs_dns_ent, h.ref); + kfree(item->hostname); + kfree(item); +} + +static struct cache_head *nfs_dns_ent_alloc(void) +{ + struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL); + + if (item != NULL) { + item->hostname = NULL; + item->namelen = 0; + item->addrlen = 0; + return &item->h; + } + return NULL; +}; + +static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key) +{ + return hash_str(key->hostname, NFS_DNS_HASHBITS); +} + +static void nfs_dns_request(struct cache_detail *cd, + struct cache_head *ch, + char **bpp, int *blen) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + + qword_add(bpp, blen, key->hostname); + (*bpp)[-1] = '\n'; +} + +static int nfs_dns_upcall(struct cache_detail *cd, + struct cache_head *ch) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + int ret; + + ret = nfs_cache_upcall(cd, key->hostname); + if (ret) + ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request); + return ret; +} + +static int nfs_dns_match(struct cache_head *ca, + struct cache_head *cb) +{ + struct nfs_dns_ent *a; + struct nfs_dns_ent *b; + + a = container_of(ca, struct nfs_dns_ent, h); + b = container_of(cb, struct nfs_dns_ent, h); + + if (a->namelen == 0 || a->namelen != b->namelen) + return 0; + return memcmp(a->hostname, b->hostname, a->namelen) == 0; +} + +static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd, + struct cache_head *h) +{ + struct nfs_dns_ent *item; + long ttl; + + if (h == NULL) { + seq_puts(m, "# ip address hostname ttl\n"); + return 0; + } + item = container_of(h, struct nfs_dns_ent, h); + ttl = (long)item->h.expiry_time - (long)get_seconds(); + if (ttl < 0) + ttl = 0; + + if (!test_bit(CACHE_NEGATIVE, &h->flags)) { + char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1]; + + rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf)); + seq_printf(m, "%15s ", buf); + } else + seq_puts(m, "<none> "); + seq_printf(m, "%15s %ld\n", item->hostname, ttl); + return 0; +} + +struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_lookup(cd, + &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, + struct nfs_dns_ent *new, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_update(cd, + &new->h, &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) +{ + char buf1[NFS_DNS_HOSTNAME_MAXLEN+1]; + struct nfs_dns_ent key, *item; + unsigned long ttl; + ssize_t len; + int ret = -EINVAL; + + if (buf[buflen-1] != '\n') + goto out; + buf[buflen-1] = '\0'; + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + key.addrlen = rpc_pton(buf1, len, + (struct sockaddr *)&key.addr, + sizeof(key.addr)); + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + + key.hostname = buf1; + key.namelen = len; + memset(&key.h, 0, sizeof(key.h)); + + ttl = get_expiry(&buf); + if (ttl == 0) + goto out; + key.h.expiry_time = ttl + get_seconds(); + + ret = -ENOMEM; + item = nfs_dns_lookup(cd, &key); + if (item == NULL) + goto out; + + if (key.addrlen == 0) + set_bit(CACHE_NEGATIVE, &key.h.flags); + + item = nfs_dns_update(cd, &key, item); + if (item == NULL) + goto out; + + ret = 0; + cache_put(&item->h, cd); +out: + return ret; +} + +static struct cache_detail nfs_dns_resolve = { + .owner = THIS_MODULE, + .hash_size = NFS_DNS_HASHTBL_SIZE, + .hash_table = nfs_dns_table, + .name = "dns_resolve", + .cache_put = nfs_dns_ent_put, + .cache_upcall = nfs_dns_upcall, + .cache_parse = nfs_dns_parse, + .cache_show = nfs_dns_show, + .match = nfs_dns_match, + .init = nfs_dns_ent_init, + .update = nfs_dns_ent_init, + .alloc = nfs_dns_ent_alloc, +}; + +static int do_cache_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item, + struct nfs_cache_defer_req *dreq) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (*item) { + ret = cache_check(cd, &(*item)->h, &dreq->req); + if (ret) + *item = NULL; + } + return ret; +} + +static int do_cache_lookup_nowait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (!*item) + goto out_err; + ret = -ETIMEDOUT; + if (!test_bit(CACHE_VALID, &(*item)->h.flags) + || (*item)->h.expiry_time < get_seconds() + || cd->flush_time > (*item)->h.last_refresh) + goto out_put; + ret = -ENOENT; + if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags)) + goto out_put; + return 0; +out_put: + cache_put(&(*item)->h, cd); +out_err: + *item = NULL; + return ret; +} + +static int do_cache_lookup_wait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + struct nfs_cache_defer_req *dreq; + int ret = -ENOMEM; + + dreq = nfs_cache_defer_req_alloc(); + if (!dreq) + goto out; + ret = do_cache_lookup(cd, key, item, dreq); + if (ret == -EAGAIN) { + ret = nfs_cache_wait_for_upcall(dreq); + if (!ret) + ret = do_cache_lookup_nowait(cd, key, item); + } + nfs_cache_defer_req_put(dreq); +out: + return ret; +} + +ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen) +{ + struct nfs_dns_ent key = { + .hostname = name, + .namelen = namelen, + }; + struct nfs_dns_ent *item = NULL; + ssize_t ret; + + ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item); + if (ret == 0) { + if (salen >= item->addrlen) { + memcpy(sa, &item->addr, item->addrlen); + ret = item->addrlen; + } else + ret = -EOVERFLOW; + cache_put(&item->h, &nfs_dns_resolve); + } else if (ret == -ENOENT) + ret = -ESRCH; + return ret; +} + +int nfs_dns_resolver_init(void) +{ + return nfs_cache_register(&nfs_dns_resolve); +} + +void nfs_dns_resolver_destroy(void) +{ + nfs_cache_unregister(&nfs_dns_resolve); +} + diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h new file mode 100644 index 000000000000..a3f0938babf7 --- /dev/null +++ b/fs/nfs/dns_resolve.h @@ -0,0 +1,14 @@ +/* + * Resolve DNS hostnames into valid ip addresses + */ +#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H +#define __LINUX_FS_NFS_DNS_RESOLVE_H + +#define NFS_DNS_HOSTNAME_MAXLEN (128) + +extern int nfs_dns_resolver_init(void); +extern void nfs_dns_resolver_destroy(void); +extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen); + +#endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0055b813ec2c..5021b75d2d1e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -26,7 +26,6 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include <linux/aio.h> #include <asm/uaccess.h> @@ -329,6 +328,42 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) } /* + * Decide whether a read/modify/write cycle may be more efficient + * then a modify/write/read cycle when writing to a page in the + * page cache. + * + * The modify/write/read cycle may occur if a page is read before + * being completely filled by the writer. In this situation, the + * page must be completely written to stable storage on the server + * before it can be refilled by reading in the page from the server. + * This can lead to expensive, small, FILE_SYNC mode writes being + * done. + * + * It may be more efficient to read the page first if the file is + * open for reading in addition to writing, the page is not marked + * as Uptodate, it is not dirty or waiting to be committed, + * indicating that it was previously allocated and then modified, + * that there were valid bytes of data in that range of the file, + * and that the new data won't completely replace the old data in + * that range of the file. + */ +static int nfs_want_read_modify_write(struct file *file, struct page *page, + loff_t pos, unsigned len) +{ + unsigned int pglen = nfs_page_length(page); + unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned int end = offset + len; + + if ((file->f_mode & FMODE_READ) && /* open for read? */ + !PageUptodate(page) && /* Uptodate? */ + !PagePrivate(page) && /* i/o request already? */ + pglen && /* valid bytes of file? */ + (end < pglen || offset)) /* replace all valid bytes? */ + return 1; + return 0; +} + +/* * This does the "real" work of the write. We must allocate and lock the * page to be sent back to the generic routine, which then copies the * data from user space. @@ -341,15 +376,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { int ret; - pgoff_t index; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct page *page; - index = pos >> PAGE_CACHE_SHIFT; + int once_thru = 0; dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); +start: /* * Prevent starvation issues if someone is doing a consistency * sync-to-disk @@ -368,6 +404,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, if (ret) { unlock_page(page); page_cache_release(page); + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; + ret = nfs_readpage(file, page); + page_cache_release(page); + if (!ret) + goto start; } return ret; } @@ -480,6 +523,7 @@ const struct address_space_operations nfs_file_aops = { .invalidatepage = nfs_invalidate_page, .releasepage = nfs_release_page, .direct_IO = nfs_direct_IO, + .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, }; diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 46177cb87064..b35d2a616066 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -30,7 +30,6 @@ #include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/namei.h> -#include <linux/mnt_namespace.h> #include <linux/security.h> #include <asm/system.h> diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 86147b0ab2cf..21a84d45916f 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -101,7 +101,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); static unsigned int fnvhash32(const void *, size_t); -static struct rpc_pipe_ops idmap_upcall_ops = { +static const struct rpc_pipe_ops idmap_upcall_ops = { .upcall = idmap_pipe_upcall, .downcall = idmap_pipe_downcall, .destroy_msg = idmap_pipe_destroy_msg, @@ -119,8 +119,8 @@ nfs_idmap_new(struct nfs_client *clp) if (idmap == NULL) return -ENOMEM; - idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap", - idmap, &idmap_upcall_ops, 0); + idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry, + "idmap", idmap, &idmap_upcall_ops, 0); if (IS_ERR(idmap->idmap_dentry)) { error = PTR_ERR(idmap->idmap_dentry); kfree(idmap); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 64f87194d390..060022b4651c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -30,7 +30,6 @@ #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> -#include <linux/smp_lock.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/nfs_idmap.h> @@ -47,6 +46,7 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" +#include "dns_resolve.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -287,6 +287,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) /* We can't support update_atime(), since the server will reset it */ inode->i_flags |= S_NOATIME|S_NOCMTIME; inode->i_mode = fattr->mode; + if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 + && nfs_server_capable(inode, NFS_CAP_MODE)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ @@ -331,20 +336,46 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; + else if (nfs_server_capable(inode, NFS_CAP_ATIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; + else if (nfs_server_capable(inode, NFS_CAP_MTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; + else if (nfs_server_capable(inode, NFS_CAP_CTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_CHANGE) nfsi->change_attr = fattr->change_attr; + else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); + else + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE; if (fattr->valid & NFS_ATTR_FATTR_NLINK) inode->i_nlink = fattr->nlink; + else if (nfs_server_capable(inode, NFS_CAP_NLINK)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -1146,6 +1177,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) loff_t cur_isize, new_isize; unsigned long invalid = 0; unsigned long now = jiffies; + unsigned long save_cache_validity; dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, @@ -1172,10 +1204,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfsi->read_cache_jiffies = fattr->time_start; - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); + save_cache_validity = nfsi->cache_validity; + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED + | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); @@ -1190,7 +1223,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); nfsi->change_attr = fattr->change_attr; } - } + } else if (server->caps & NFS_CAP_CHANGE_ATTR) + invalid |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { /* NFSv2/v3: Check if the mtime agrees */ @@ -1202,7 +1236,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } - } + } else if (server->caps & NFS_CAP_MTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + if (fattr->valid & NFS_ATTR_FATTR_CTIME) { /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { @@ -1216,7 +1255,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } - } + } else if (server->caps & NFS_CAP_CTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -1232,30 +1275,50 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dprintk("NFS: isize change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); } - } + } else + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_ATIME) memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + else if (server->caps & NFS_CAP_ATIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_mode = fattr->mode; } - } + } else if (server->caps & NFS_CAP_MODE) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (inode->i_uid != fattr->uid) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; } - } + } else if (server->caps & NFS_CAP_OWNER) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (inode->i_gid != fattr->gid) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; } - } + } else if (server->caps & NFS_CAP_OWNER_GROUP) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { @@ -1264,7 +1327,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_DATA; inode->i_nlink = fattr->nlink; } - } + } else if (server->caps & NFS_CAP_NLINK) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* @@ -1294,9 +1359,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; if (!nfs_have_delegation(inode, FMODE_READ) || - (nfsi->cache_validity & NFS_INO_REVAL_FORCED)) + (save_cache_validity & NFS_INO_REVAL_FORCED)) nfsi->cache_validity |= invalid; - nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED; return 0; out_changed: @@ -1443,6 +1507,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_dns_resolver_init(); + if (err < 0) + goto out8; + err = nfs_fscache_register(); if (err < 0) goto out7; @@ -1501,6 +1569,8 @@ out5: out6: nfs_fscache_unregister(); out7: + nfs_dns_resolver_destroy(); +out8: return err; } @@ -1512,6 +1582,7 @@ static void __exit exit_nfs_fs(void) nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); nfs_fscache_unregister(); + nfs_dns_resolver_destroy(); #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7dd90a6769d0..e21b1bb9972f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -49,6 +49,11 @@ struct nfs_clone_mount { #define NFS_MAX_SECFLAVORS (12) /* + * Value used if the user did not specify a port value. + */ +#define NFS_UNSPEC_PORT (-1) + +/* * In-kernel mount arguments */ struct nfs_parsed_mount_data { @@ -63,6 +68,7 @@ struct nfs_parsed_mount_data { unsigned int auth_flavor_len; rpc_authflavor_t auth_flavors[1]; char *client_address; + unsigned int version; unsigned int minorversion; char *fscache_uniq; @@ -71,7 +77,7 @@ struct nfs_parsed_mount_data { size_t addrlen; char *hostname; u32 version; - unsigned short port; + int port; unsigned short protocol; } mount_server; @@ -80,7 +86,7 @@ struct nfs_parsed_mount_data { size_t addrlen; char *hostname; char *export_path; - unsigned short port; + int port; unsigned short protocol; } nfs_server; @@ -102,6 +108,7 @@ struct nfs_mount_request { }; extern int nfs_mount(struct nfs_mount_request *info); +extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern struct rpc_program nfs_program; @@ -213,7 +220,6 @@ void nfs_zap_acl_cache(struct inode *inode); extern int nfs_wait_bit_killable(void *word); /* super.c */ -void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); extern struct file_system_type nfs_xdev_fs_type; #ifdef CONFIG_NFS_V4 extern struct file_system_type nfs4_xdev_fs_type; @@ -248,6 +254,12 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +#ifdef CONFIG_MIGRATION +extern int nfs_migrate_page(struct address_space *, + struct page *, struct page *); +#else +#define nfs_migrate_page NULL +#endif /* nfs4proc.c */ extern int _nfs4_call_sync(struct nfs_server *server, @@ -368,24 +380,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >> PAGE_SHIFT; } - -#define IPV6_SCOPE_DELIMITER '%' - -/* - * Set the port number in an address. Be agnostic about the address - * family. - */ -static inline void nfs_set_port(struct sockaddr *sap, unsigned short port) -{ - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap; - - switch (sap->sa_family) { - case AF_INET: - ap->sin_port = htons(port); - break; - case AF_INET6: - ap6->sin6_port = htons(port); - break; - } -} diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 38ef9eaec407..0adefc40cc89 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -209,6 +209,71 @@ out_mnt_err: goto out; } +/** + * nfs_umount - Notify a server that we have unmounted this export + * @info: pointer to umount request arguments + * + * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always + * use UDP. + */ +void nfs_umount(const struct nfs_mount_request *info) +{ + static const struct rpc_timeout nfs_umnt_timeout = { + .to_initval = 1 * HZ, + .to_maxval = 3 * HZ, + .to_retries = 2, + }; + struct rpc_create_args args = { + .protocol = IPPROTO_UDP, + .address = info->sap, + .addrsize = info->salen, + .timeout = &nfs_umnt_timeout, + .servername = info->hostname, + .program = &mnt_program, + .version = info->version, + .authflavor = RPC_AUTH_UNIX, + .flags = RPC_CLNT_CREATE_NOPING, + }; + struct mountres result; + struct rpc_message msg = { + .rpc_argp = info->dirpath, + .rpc_resp = &result, + }; + struct rpc_clnt *clnt; + int status; + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + clnt = rpc_create(&args); + if (unlikely(IS_ERR(clnt))) + goto out_clnt_err; + + dprintk("NFS: sending UMNT request for %s:%s\n", + (info->hostname ? info->hostname : "server"), info->dirpath); + + if (info->version == NFS_MNT3_VERSION) + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT]; + else + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT]; + + status = rpc_call_sync(clnt, &msg, 0); + rpc_shutdown_client(clnt); + + if (unlikely(status < 0)) + goto out_call_err; + + return; + +out_clnt_err: + dprintk("NFS: failed to create UMNT RPC client, status=%ld\n", + PTR_ERR(clnt)); + return; + +out_call_err: + dprintk("NFS: UMNT request failed, status=%d\n", status); +} + /* * XDR encode/decode functions for MOUNT */ @@ -258,7 +323,7 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res) return -EIO; status = ntohl(*p); - for (i = 0; i <= ARRAY_SIZE(mnt_errtbl); i++) { + for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { if (mnt_errtbl[i].status == status) { res->errno = mnt_errtbl[i].errno; return 0; @@ -309,7 +374,7 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) return -EIO; status = ntohl(*p); - for (i = 0; i <= ARRAY_SIZE(mnt3_errtbl); i++) { + for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { if (mnt3_errtbl[i].status == status) { res->errno = mnt3_errtbl[i].errno; return 0; @@ -407,6 +472,13 @@ static struct rpc_procinfo mnt_procedures[] = { .p_statidx = MOUNTPROC_MNT, .p_name = "MOUNT", }, + [MOUNTPROC_UMNT] = { + .p_proc = MOUNTPROC_UMNT, + .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC_UMNT, + .p_name = "UMOUNT", + }, }; static struct rpc_procinfo mnt3_procedures[] = { @@ -419,6 +491,13 @@ static struct rpc_procinfo mnt3_procedures[] = { .p_statidx = MOUNTPROC3_MNT, .p_name = "MOUNT", }, + [MOUNTPROC3_UMNT] = { + .p_proc = MOUNTPROC3_UMNT, + .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC3_UMNT, + .p_name = "UMOUNT", + }, }; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d0cc5ce0edfe..ee6a13f05443 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -299,7 +299,6 @@ static void nfs3_free_createdata(struct nfs3_createdata *data) /* * Create a regular file. - * For now, we don't implement O_EXCL. */ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 61bc3a32e1e2..6ea07a3c75d4 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -220,6 +220,7 @@ extern void nfs4_destroy_session(struct nfs4_session *session); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); extern int nfs4_proc_create_session(struct nfs_client *, int reset); extern int nfs4_proc_destroy_session(struct nfs4_session *); +extern int nfs4_init_session(struct nfs_server *server); #else /* CONFIG_NFS_v4_1 */ static inline int nfs4_setup_sequence(struct nfs_client *clp, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -227,6 +228,11 @@ static inline int nfs4_setup_sequence(struct nfs_client *clp, { return 0; } + +static inline int nfs4_init_session(struct nfs_server *server) +{ + return 0; +} #endif /* CONFIG_NFS_V4_1 */ extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2a2a0a7143ad..2636c26d56fa 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -17,6 +17,7 @@ #include <linux/inet.h> #include "internal.h" #include "nfs4_fs.h" +#include "dns_resolve.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -95,6 +96,20 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, return 0; } +static size_t nfs_parse_server_name(char *string, size_t len, + struct sockaddr *sa, size_t salen) +{ + ssize_t ret; + + ret = rpc_pton(string, len, sa, salen); + if (ret == 0) { + ret = nfs_dns_resolve_name(string, len, sa, salen); + if (ret < 0) + ret = 0; + } + return ret; +} + static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, char *page, char *page2, const struct nfs4_fs_location *location) @@ -121,11 +136,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) continue; - nfs_parse_ip_address(buf->data, buf->len, - mountdata->addr, &mountdata->addrlen); - if (mountdata->addr->sa_family == AF_UNSPEC) + mountdata->addrlen = nfs_parse_server_name(buf->data, + buf->len, + mountdata->addr, mountdata->addrlen); + if (mountdata->addrlen == 0) continue; - nfs_set_port(mountdata->addr, NFS_PORT); + rpc_set_port(mountdata->addr, NFS_PORT); memcpy(page2, buf->data, buf->len); page2[buf->len] = '\0'; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 92ce43517814..be6544aef41f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -45,7 +45,6 @@ #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> -#include <linux/smp_lock.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/module.h> @@ -62,6 +61,8 @@ #define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) +#define NFS4_MAX_LOOP_ON_RECOVER (10) + struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); @@ -427,17 +428,19 @@ out: static int nfs4_recover_session(struct nfs4_session *session) { struct nfs_client *clp = session->clp; + unsigned int loop; int ret; - for (;;) { + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { ret = nfs4_wait_clnt_recover(clp); if (ret != 0) - return ret; + break; if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) break; nfs4_schedule_state_manager(clp); + ret = -EIO; } - return 0; + return ret; } static int nfs41_setup_sequence(struct nfs4_session *session, @@ -1445,18 +1448,20 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) static int nfs4_recover_expired_lease(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; + unsigned int loop; int ret; - for (;;) { + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { ret = nfs4_wait_clnt_recover(clp); if (ret != 0) - return ret; + break; if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) break; nfs4_schedule_state_recovery(clp); + ret = -EIO; } - return 0; + return ret; } /* @@ -1998,12 +2003,34 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f status = nfs4_call_sync(server, &msg, &args, &res, 0); if (status == 0) { memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); + server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| + NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| + NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| + NFS_CAP_CTIME|NFS_CAP_MTIME); if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) server->caps |= NFS_CAP_ACLS; if (res.has_links != 0) server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; + if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID) + server->caps |= NFS_CAP_FILEID; + if (res.attr_bitmask[1] & FATTR4_WORD1_MODE) + server->caps |= NFS_CAP_MODE; + if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS) + server->caps |= NFS_CAP_NLINK; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER) + server->caps |= NFS_CAP_OWNER; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP) + server->caps |= NFS_CAP_OWNER_GROUP; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS) + server->caps |= NFS_CAP_ATIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA) + server->caps |= NFS_CAP_CTIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) + server->caps |= NFS_CAP_MTIME; + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; @@ -2041,15 +2068,9 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_argp = &args, .rpc_resp = &res, }; - int status; nfs_fattr_init(info->fattr); - status = nfs4_recover_expired_lease(server); - if (!status) - status = nfs4_check_client_ready(server->nfs_client); - if (!status) - status = nfs4_call_sync(server, &msg, &args, &res, 0); - return status; + return nfs4_call_sync(server, &msg, &args, &res, 0); } static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, @@ -4100,15 +4121,23 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (request->fl_start < 0 || request->fl_end < 0) return -EINVAL; - if (IS_GETLK(cmd)) - return nfs4_proc_getlk(state, F_GETLK, request); + if (IS_GETLK(cmd)) { + if (state != NULL) + return nfs4_proc_getlk(state, F_GETLK, request); + return 0; + } if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) return -EINVAL; - if (request->fl_type == F_UNLCK) - return nfs4_proc_unlck(state, cmd, request); + if (request->fl_type == F_UNLCK) { + if (state != NULL) + return nfs4_proc_unlck(state, cmd, request); + return 0; + } + if (state == NULL) + return -ENOLCK; do { status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) @@ -4794,6 +4823,22 @@ int nfs4_proc_destroy_session(struct nfs4_session *session) return status; } +int nfs4_init_session(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + int ret; + + if (!nfs4_has_session(clp)) + return 0; + + clp->cl_session->fc_attrs.max_rqst_sz = server->wsize; + clp->cl_session->fc_attrs.max_resp_sz = server->rsize; + ret = nfs4_recover_expired_lease(server); + if (!ret) + ret = nfs4_check_client_ready(clp); + return ret; +} + /* * Renew the cl_session lease. */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index b73c5a728655..1434080aefeb 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -553,6 +553,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f INIT_LIST_HEAD(&lsp->ls_sequence.list); lsp->ls_seqid.sequence = &lsp->ls_sequence; atomic_set(&lsp->ls_count, 1); + lsp->ls_state = state; lsp->ls_owner = fl_owner; spin_lock(&clp->cl_lock); nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); @@ -587,7 +588,6 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ if (lsp != NULL) break; if (new != NULL) { - new->ls_state = state; list_add(&new->ls_locks, &state->lock_states); set_bit(LK_STATE_IN_USE, &state->flags); lsp = new; @@ -1250,8 +1250,8 @@ static void nfs4_state_manager(struct nfs_client *clp) continue; } /* Initialize or reset the session */ - if (nfs4_has_session(clp) && - test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) { + if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state) + && nfs4_has_session(clp)) { if (clp->cl_cons_state == NFS_CS_SESSION_INITING) status = nfs4_initialize_session(clp); else diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 617273e7d47f..cfc30d362f94 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -702,29 +702,12 @@ struct compound_hdr { u32 minorversion; }; -/* - * START OF "GENERIC" ENCODE ROUTINES. - * These may look a little ugly since they are imported from a "generic" - * set of XDR encode/decode routines which are intended to be shared by - * all of our NFSv4 implementations (OpenBSD, MacOS X...). - * - * If the pain of reading these is too great, it should be a straightforward - * task to translate them into Linux-specific versions which are more - * consistent with the style used in NFSv2/v3... - */ -#define WRITE32(n) *p++ = htonl(n) -#define WRITE64(n) do { \ - *p++ = htonl((uint32_t)((n) >> 32)); \ - *p++ = htonl((uint32_t)(n)); \ -} while (0) -#define WRITEMEM(ptr,nbytes) do { \ - p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ -} while (0) - -#define RESERVE_SPACE(nbytes) do { \ - p = xdr_reserve_space(xdr, nbytes); \ - BUG_ON(!p); \ -} while (0) +static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p = xdr_reserve_space(xdr, nbytes); + BUG_ON(!p); + return p; +} static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { @@ -749,12 +732,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr, dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); - RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); - WRITE32(hdr->taglen); - WRITEMEM(hdr->tag, hdr->taglen); - WRITE32(hdr->minorversion); + p = reserve_space(xdr, 4 + hdr->taglen + 8); + p = xdr_encode_opaque(p, hdr->tag, hdr->taglen); + *p++ = cpu_to_be32(hdr->minorversion); hdr->nops_p = p; - WRITE32(hdr->nops); + *p = cpu_to_be32(hdr->nops); } static void encode_nops(struct compound_hdr *hdr) @@ -829,55 +811,53 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len += 16; else if (iap->ia_valid & ATTR_MTIME) len += 4; - RESERVE_SPACE(len); + p = reserve_space(xdr, len); /* * We write the bitmap length now, but leave the bitmap and the attribute * buffer length to be backfilled at the end of this routine. */ - WRITE32(2); + *p++ = cpu_to_be32(2); q = p; p += 3; if (iap->ia_valid & ATTR_SIZE) { bmval0 |= FATTR4_WORD0_SIZE; - WRITE64(iap->ia_size); + p = xdr_encode_hyper(p, iap->ia_size); } if (iap->ia_valid & ATTR_MODE) { bmval1 |= FATTR4_WORD1_MODE; - WRITE32(iap->ia_mode & S_IALLUGO); + *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); } if (iap->ia_valid & ATTR_UID) { bmval1 |= FATTR4_WORD1_OWNER; - WRITE32(owner_namelen); - WRITEMEM(owner_name, owner_namelen); + p = xdr_encode_opaque(p, owner_name, owner_namelen); } if (iap->ia_valid & ATTR_GID) { bmval1 |= FATTR4_WORD1_OWNER_GROUP; - WRITE32(owner_grouplen); - WRITEMEM(owner_group, owner_grouplen); + p = xdr_encode_opaque(p, owner_group, owner_grouplen); } if (iap->ia_valid & ATTR_ATIME_SET) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - WRITE32(NFS4_SET_TO_CLIENT_TIME); - WRITE32(0); - WRITE32(iap->ia_mtime.tv_sec); - WRITE32(iap->ia_mtime.tv_nsec); + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); } else if (iap->ia_valid & ATTR_ATIME) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - WRITE32(NFS4_SET_TO_SERVER_TIME); + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } if (iap->ia_valid & ATTR_MTIME_SET) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - WRITE32(NFS4_SET_TO_CLIENT_TIME); - WRITE32(0); - WRITE32(iap->ia_mtime.tv_sec); - WRITE32(iap->ia_mtime.tv_nsec); + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); } else if (iap->ia_valid & ATTR_MTIME) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - WRITE32(NFS4_SET_TO_SERVER_TIME); + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } /* @@ -891,7 +871,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len = (char *)p - (char *)q - 12; *q++ = htonl(bmval0); *q++ = htonl(bmval1); - *q++ = htonl(len); + *q = htonl(len); /* out: */ } @@ -900,9 +880,9 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd { __be32 *p; - RESERVE_SPACE(8); - WRITE32(OP_ACCESS); - WRITE32(access); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_ACCESS); + *p = cpu_to_be32(access); hdr->nops++; hdr->replen += decode_access_maxsz; } @@ -911,10 +891,10 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg { __be32 *p; - RESERVE_SPACE(8+NFS4_STATEID_SIZE); - WRITE32(OP_CLOSE); - WRITE32(arg->seqid->sequence->counter); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_CLOSE); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); + xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_close_maxsz; } @@ -923,10 +903,10 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar { __be32 *p; - RESERVE_SPACE(16); - WRITE32(OP_COMMIT); - WRITE64(args->offset); - WRITE32(args->count); + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(OP_COMMIT); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_commit_maxsz; } @@ -935,30 +915,28 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * { __be32 *p; - RESERVE_SPACE(8); - WRITE32(OP_CREATE); - WRITE32(create->ftype); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_CREATE); + *p = cpu_to_be32(create->ftype); switch (create->ftype) { case NF4LNK: - RESERVE_SPACE(4); - WRITE32(create->u.symlink.len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(create->u.symlink.len); xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); break; case NF4BLK: case NF4CHR: - RESERVE_SPACE(8); - WRITE32(create->u.device.specdata1); - WRITE32(create->u.device.specdata2); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(create->u.device.specdata1); + *p = cpu_to_be32(create->u.device.specdata2); break; default: break; } - RESERVE_SPACE(4 + create->name->len); - WRITE32(create->name->len); - WRITEMEM(create->name->name, create->name->len); + encode_string(xdr, create->name->len, create->name->name); hdr->nops++; hdr->replen += decode_create_maxsz; @@ -969,10 +947,10 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c { __be32 *p; - RESERVE_SPACE(12); - WRITE32(OP_GETATTR); - WRITE32(1); - WRITE32(bitmap); + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(bitmap); hdr->nops++; hdr->replen += decode_getattr_maxsz; } @@ -981,11 +959,11 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm { __be32 *p; - RESERVE_SPACE(16); - WRITE32(OP_GETATTR); - WRITE32(2); - WRITE32(bm0); - WRITE32(bm1); + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p = cpu_to_be32(bm1); hdr->nops++; hdr->replen += decode_getattr_maxsz; } @@ -1012,8 +990,8 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_GETFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_GETFH); hdr->nops++; hdr->replen += decode_getfh_maxsz; } @@ -1022,10 +1000,9 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct { __be32 *p; - RESERVE_SPACE(8 + name->len); - WRITE32(OP_LINK); - WRITE32(name->len); - WRITEMEM(name->name, name->len); + p = reserve_space(xdr, 8 + name->len); + *p++ = cpu_to_be32(OP_LINK); + xdr_encode_opaque(p, name->name, name->len); hdr->nops++; hdr->replen += decode_link_maxsz; } @@ -1052,27 +1029,27 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args { __be32 *p; - RESERVE_SPACE(32); - WRITE32(OP_LOCK); - WRITE32(nfs4_lock_type(args->fl, args->block)); - WRITE32(args->reclaim); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); - WRITE32(args->new_lock_owner); + p = reserve_space(xdr, 32); + *p++ = cpu_to_be32(OP_LOCK); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); + *p++ = cpu_to_be32(args->reclaim); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ - RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); - WRITE32(args->open_seqid->sequence->counter); - WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); - WRITE32(args->lock_seqid->sequence->counter); - WRITE64(args->lock_owner.clientid); - WRITE32(16); - WRITEMEM("lock id:", 8); - WRITE64(args->lock_owner.id); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); + *p++ = cpu_to_be32(args->open_seqid->sequence->counter); + p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); + p = xdr_encode_hyper(p, args->lock_owner.clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + xdr_encode_hyper(p, args->lock_owner.id); } else { - RESERVE_SPACE(NFS4_STATEID_SIZE+4); - WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); - WRITE32(args->lock_seqid->sequence->counter); + p = reserve_space(xdr, NFS4_STATEID_SIZE+4); + p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->lock_seqid->sequence->counter); } hdr->nops++; hdr->replen += decode_lock_maxsz; @@ -1082,15 +1059,15 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar { __be32 *p; - RESERVE_SPACE(52); - WRITE32(OP_LOCKT); - WRITE32(nfs4_lock_type(args->fl, 0)); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); - WRITE64(args->lock_owner.clientid); - WRITE32(16); - WRITEMEM("lock id:", 8); - WRITE64(args->lock_owner.id); + p = reserve_space(xdr, 52); + *p++ = cpu_to_be32(OP_LOCKT); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + p = xdr_encode_hyper(p, args->lock_owner.clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + xdr_encode_hyper(p, args->lock_owner.id); hdr->nops++; hdr->replen += decode_lockt_maxsz; } @@ -1099,13 +1076,13 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar { __be32 *p; - RESERVE_SPACE(12+NFS4_STATEID_SIZE+16); - WRITE32(OP_LOCKU); - WRITE32(nfs4_lock_type(args->fl, 0)); - WRITE32(args->seqid->sequence->counter); - WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); + p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16); + *p++ = cpu_to_be32(OP_LOCKU); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(args->seqid->sequence->counter); + p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); hdr->nops++; hdr->replen += decode_locku_maxsz; } @@ -1115,10 +1092,9 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc int len = name->len; __be32 *p; - RESERVE_SPACE(8 + len); - WRITE32(OP_LOOKUP); - WRITE32(len); - WRITEMEM(name->name, len); + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_LOOKUP); + xdr_encode_opaque(p, name->name, len); hdr->nops++; hdr->replen += decode_lookup_maxsz; } @@ -1127,21 +1103,21 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) { __be32 *p; - RESERVE_SPACE(8); + p = reserve_space(xdr, 8); switch (fmode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: - WRITE32(NFS4_SHARE_ACCESS_READ); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ); break; case FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_WRITE); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE); break; case FMODE_READ|FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_BOTH); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH); break; default: - WRITE32(0); + *p++ = cpu_to_be32(0); } - WRITE32(0); /* for linux, share_deny = 0 always */ + *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ } static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) @@ -1151,29 +1127,29 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, * owner 4 = 32 */ - RESERVE_SPACE(8); - WRITE32(OP_OPEN); - WRITE32(arg->seqid->sequence->counter); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_OPEN); + *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); - RESERVE_SPACE(28); - WRITE64(arg->clientid); - WRITE32(16); - WRITEMEM("open id:", 8); - WRITE64(arg->id); + p = reserve_space(xdr, 28); + p = xdr_encode_hyper(p, arg->clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "open id:", 8); + xdr_encode_hyper(p, arg->id); } static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch(arg->open_flags & O_EXCL) { case 0: - WRITE32(NFS4_CREATE_UNCHECKED); + *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); encode_attrs(xdr, arg->u.attrs, arg->server); break; default: - WRITE32(NFS4_CREATE_EXCLUSIVE); + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); encode_nfs4_verifier(xdr, &arg->u.verifier); } } @@ -1182,14 +1158,14 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch (arg->open_flags & O_CREAT) { case 0: - WRITE32(NFS4_OPEN_NOCREATE); + *p = cpu_to_be32(NFS4_OPEN_NOCREATE); break; default: BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); - WRITE32(NFS4_OPEN_CREATE); + *p = cpu_to_be32(NFS4_OPEN_CREATE); encode_createmode(xdr, arg); } } @@ -1198,16 +1174,16 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch (delegation_type) { case 0: - WRITE32(NFS4_OPEN_DELEGATE_NONE); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); break; case FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_READ); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); break; case FMODE_WRITE|FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_WRITE); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); break; default: BUG(); @@ -1218,8 +1194,8 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr * { __be32 *p; - RESERVE_SPACE(4); - WRITE32(NFS4_OPEN_CLAIM_NULL); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL); encode_string(xdr, name->len, name->name); } @@ -1227,8 +1203,8 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS); encode_delegation_type(xdr, type); } @@ -1236,9 +1212,9 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR); - WRITEMEM(stateid->data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); encode_string(xdr, name->len, name->name); } @@ -1267,10 +1243,10 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); - WRITE32(OP_OPEN_CONFIRM); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - WRITE32(arg->seqid->sequence->counter); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_CONFIRM); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); hdr->nops++; hdr->replen += decode_open_confirm_maxsz; } @@ -1279,10 +1255,10 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); - WRITE32(OP_OPEN_DOWNGRADE); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - WRITE32(arg->seqid->sequence->counter); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); hdr->nops++; hdr->replen += decode_open_downgrade_maxsz; @@ -1294,10 +1270,9 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd int len = fh->size; __be32 *p; - RESERVE_SPACE(8 + len); - WRITE32(OP_PUTFH); - WRITE32(len); - WRITEMEM(fh->data, len); + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_PUTFH); + xdr_encode_opaque(p, fh->data, len); hdr->nops++; hdr->replen += decode_putfh_maxsz; } @@ -1306,8 +1281,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_PUTROOTFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_PUTROOTFH); hdr->nops++; hdr->replen += decode_putrootfh_maxsz; } @@ -1317,26 +1292,26 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context nfs4_stateid stateid; __be32 *p; - RESERVE_SPACE(NFS4_STATEID_SIZE); + p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); - WRITEMEM(stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); } else - WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); } static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_READ); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READ); encode_stateid(xdr, args->context); - RESERVE_SPACE(12); - WRITE64(args->offset); - WRITE32(args->count); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_read_maxsz; } @@ -1349,20 +1324,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg }; __be32 *p; - RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20); - WRITE32(OP_READDIR); - WRITE64(readdir->cookie); - WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE); - WRITE32(readdir->count >> 1); /* We're not doing readdirplus */ - WRITE32(readdir->count); - WRITE32(2); + p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); + *p++ = cpu_to_be32(OP_READDIR); + p = xdr_encode_hyper(p, readdir->cookie); + p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ + *p++ = cpu_to_be32(readdir->count); + *p++ = cpu_to_be32(2); /* Switch to mounted_on_fileid if the server supports it */ if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) attrs[0] &= ~FATTR4_WORD0_FILEID; else attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - WRITE32(attrs[0] & readdir->bitmask[0]); - WRITE32(attrs[1] & readdir->bitmask[1]); + *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); + *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); hdr->nops++; hdr->replen += decode_readdir_maxsz; dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", @@ -1378,8 +1353,8 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink * { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_READLINK); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READLINK); hdr->nops++; hdr->replen += decode_readlink_maxsz; } @@ -1388,10 +1363,9 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc { __be32 *p; - RESERVE_SPACE(8 + name->len); - WRITE32(OP_REMOVE); - WRITE32(name->len); - WRITEMEM(name->name, name->len); + p = reserve_space(xdr, 8 + name->len); + *p++ = cpu_to_be32(OP_REMOVE); + xdr_encode_opaque(p, name->name, name->len); hdr->nops++; hdr->replen += decode_remove_maxsz; } @@ -1400,14 +1374,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co { __be32 *p; - RESERVE_SPACE(8 + oldname->len); - WRITE32(OP_RENAME); - WRITE32(oldname->len); - WRITEMEM(oldname->name, oldname->len); - - RESERVE_SPACE(4 + newname->len); - WRITE32(newname->len); - WRITEMEM(newname->name, newname->len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RENAME); + encode_string(xdr, oldname->len, oldname->name); + encode_string(xdr, newname->len, newname->name); hdr->nops++; hdr->replen += decode_rename_maxsz; } @@ -1416,9 +1386,9 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client { __be32 *p; - RESERVE_SPACE(12); - WRITE32(OP_RENEW); - WRITE64(client_stateid->cl_clientid); + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(OP_RENEW); + xdr_encode_hyper(p, client_stateid->cl_clientid); hdr->nops++; hdr->replen += decode_renew_maxsz; } @@ -1428,8 +1398,8 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_RESTOREFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RESTOREFH); hdr->nops++; hdr->replen += decode_restorefh_maxsz; } @@ -1439,16 +1409,16 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_SETATTR); - WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); - RESERVE_SPACE(2*4); - WRITE32(1); - WRITE32(FATTR4_WORD0_ACL); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); if (arg->acl_len % 4) return -EINVAL; - RESERVE_SPACE(4); - WRITE32(arg->acl_len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); hdr->nops++; hdr->replen += decode_setacl_maxsz; @@ -1460,8 +1430,8 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_SAVEFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_SAVEFH); hdr->nops++; hdr->replen += decode_savefh_maxsz; } @@ -1470,9 +1440,9 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_SETATTR); - WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); + xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_setattr_maxsz; encode_attrs(xdr, arg->iap, server); @@ -1482,17 +1452,17 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie { __be32 *p; - RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE); - WRITE32(OP_SETCLIENTID); - WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); + p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID); + xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); - RESERVE_SPACE(4); - WRITE32(setclientid->sc_prog); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_prog); encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); - RESERVE_SPACE(4); - WRITE32(setclientid->sc_cb_ident); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_cb_ident); hdr->nops++; hdr->replen += decode_setclientid_maxsz; } @@ -1501,10 +1471,10 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_ { __be32 *p; - RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); - WRITE32(OP_SETCLIENTID_CONFIRM); - WRITE64(client_state->cl_clientid); - WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); + p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); + p = xdr_encode_hyper(p, client_state->cl_clientid); + xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); hdr->nops++; hdr->replen += decode_setclientid_confirm_maxsz; } @@ -1513,15 +1483,15 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_WRITE); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_WRITE); encode_stateid(xdr, args->context); - RESERVE_SPACE(16); - WRITE64(args->offset); - WRITE32(args->stable); - WRITE32(args->count); + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); xdr_write_pages(xdr, args->pages, args->pgbase, args->count); hdr->nops++; @@ -1532,10 +1502,10 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); - WRITE32(OP_DELEGRETURN); - WRITEMEM(stateid->data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_DELEGRETURN); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_delegreturn_maxsz; } @@ -1548,16 +1518,16 @@ static void encode_exchange_id(struct xdr_stream *xdr, { __be32 *p; - RESERVE_SPACE(4 + sizeof(args->verifier->data)); - WRITE32(OP_EXCHANGE_ID); - WRITEMEM(args->verifier->data, sizeof(args->verifier->data)); + p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); + *p++ = cpu_to_be32(OP_EXCHANGE_ID); + xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); encode_string(xdr, args->id_len, args->id); - RESERVE_SPACE(12); - WRITE32(args->flags); - WRITE32(0); /* zero length state_protect4_a */ - WRITE32(0); /* zero length implementation id array */ + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(args->flags); + *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ + *p = cpu_to_be32(0); /* zero length implementation id array */ hdr->nops++; hdr->replen += decode_exchange_id_maxsz; } @@ -1571,55 +1541,43 @@ static void encode_create_session(struct xdr_stream *xdr, uint32_t len; struct nfs_client *clp = args->client; - RESERVE_SPACE(4); - WRITE32(OP_CREATE_SESSION); - - RESERVE_SPACE(8); - WRITE64(clp->cl_ex_clid); + len = scnprintf(machine_name, sizeof(machine_name), "%s", + clp->cl_ipaddr); - RESERVE_SPACE(8); - WRITE32(clp->cl_seqid); /*Sequence id */ - WRITE32(args->flags); /*flags */ + p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); + *p++ = cpu_to_be32(OP_CREATE_SESSION); + p = xdr_encode_hyper(p, clp->cl_ex_clid); + *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ + *p++ = cpu_to_be32(args->flags); /*flags */ - RESERVE_SPACE(2*28); /* 2 channel_attrs */ /* Fore Channel */ - WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ - WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */ - WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */ - WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ - WRITE32(args->fc_attrs.max_ops); /* max operations */ - WRITE32(args->fc_attrs.max_reqs); /* max requests */ - WRITE32(0); /* rdmachannel_attrs */ + *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ /* Back Channel */ - WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ - WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */ - WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */ - WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ - WRITE32(args->bc_attrs.max_ops); /* max operations */ - WRITE32(args->bc_attrs.max_reqs); /* max requests */ - WRITE32(0); /* rdmachannel_attrs */ - - RESERVE_SPACE(4); - WRITE32(args->cb_program); /* cb_program */ - - RESERVE_SPACE(4); /* # of security flavors */ - WRITE32(1); - - RESERVE_SPACE(4); - WRITE32(RPC_AUTH_UNIX); /* auth_sys */ + *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ + + *p++ = cpu_to_be32(args->cb_program); /* cb_program */ + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ /* authsys_parms rfc1831 */ - RESERVE_SPACE(4); - WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ - len = scnprintf(machine_name, sizeof(machine_name), "%s", - clp->cl_ipaddr); - RESERVE_SPACE(16 + len); - WRITE32(len); - WRITEMEM(machine_name, len); - WRITE32(0); /* UID */ - WRITE32(0); /* GID */ - WRITE32(0); /* No more gids */ + *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ + p = xdr_encode_opaque(p, machine_name, len); + *p++ = cpu_to_be32(0); /* UID */ + *p++ = cpu_to_be32(0); /* GID */ + *p = cpu_to_be32(0); /* No more gids */ hdr->nops++; hdr->replen += decode_create_session_maxsz; } @@ -1629,9 +1587,9 @@ static void encode_destroy_session(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN); - WRITE32(OP_DESTROY_SESSION); - WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(OP_DESTROY_SESSION); + xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); hdr->nops++; hdr->replen += decode_destroy_session_maxsz; } @@ -1655,8 +1613,8 @@ static void encode_sequence(struct xdr_stream *xdr, WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); slot = tp->slots + args->sa_slotid; - RESERVE_SPACE(4); - WRITE32(OP_SEQUENCE); + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16); + *p++ = cpu_to_be32(OP_SEQUENCE); /* * Sessionid + seqid + slotid + max slotid + cache_this @@ -1670,12 +1628,11 @@ static void encode_sequence(struct xdr_stream *xdr, ((u32 *)session->sess_id.data)[3], slot->seq_nr, args->sa_slotid, tp->highest_used_slotid, args->sa_cache_this); - RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16); - WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(slot->seq_nr); - WRITE32(args->sa_slotid); - WRITE32(tp->highest_used_slotid); - WRITE32(args->sa_cache_this); + p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(slot->seq_nr); + *p++ = cpu_to_be32(args->sa_slotid); + *p++ = cpu_to_be32(tp->highest_used_slotid); + *p = cpu_to_be32(args->sa_cache_this); hdr->nops++; hdr->replen += decode_sequence_maxsz; #endif /* CONFIG_NFS_V4_1 */ @@ -2466,68 +2423,53 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, } #endif /* CONFIG_NFS_V4_1 */ -/* - * START OF "GENERIC" DECODE ROUTINES. - * These may look a little ugly since they are imported from a "generic" - * set of XDR encode/decode routines which are intended to be shared by - * all of our NFSv4 implementations (OpenBSD, MacOS X...). - * - * If the pain of reading these is too great, it should be a straightforward - * task to translate them into Linux-specific versions which are more - * consistent with the style used in NFSv2/v3... - */ -#define READ32(x) (x) = ntohl(*p++) -#define READ64(x) do { \ - (x) = (u64)ntohl(*p++) << 32; \ - (x) |= ntohl(*p++); \ -} while (0) -#define READTIME(x) do { \ - p++; \ - (x.tv_sec) = ntohl(*p++); \ - (x.tv_nsec) = ntohl(*p++); \ -} while (0) -#define COPYMEM(x,nbytes) do { \ - memcpy((x), p, nbytes); \ - p += XDR_QUADLEN(nbytes); \ -} while (0) - -#define READ_BUF(nbytes) do { \ - p = xdr_inline_decode(xdr, nbytes); \ - if (unlikely(!p)) { \ - dprintk("nfs: %s: prematurely hit end of receive" \ - " buffer\n", __func__); \ - dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \ - __func__, xdr->p, nbytes, xdr->end); \ - return -EIO; \ - } \ -} while (0) +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("nfs: %s: prematurely hit end of receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { __be32 *p; - READ_BUF(4); - READ32(*len); - READ_BUF(*len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, *len); + if (unlikely(!p)) + goto out_overflow; *string = (char *)p; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - READ_BUF(8); - READ32(hdr->status); - READ32(hdr->taglen); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + hdr->taglen = be32_to_cpup(p); - READ_BUF(hdr->taglen + 4); + p = xdr_inline_decode(xdr, hdr->taglen + 4); + if (unlikely(!p)) + goto out_overflow; hdr->tag = (char *)p; p += XDR_QUADLEN(hdr->taglen); - READ32(hdr->nops); + hdr->nops = be32_to_cpup(p); if (unlikely(hdr->nops < 1)) return nfs4_stat_to_errno(hdr->status); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) @@ -2536,18 +2478,23 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) uint32_t opnum; int32_t nfserr; - READ_BUF(8); - READ32(opnum); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); if (opnum != expected) { dprintk("nfs: Server returned operation" " %d but we issued a request for %d\n", opnum, expected); return -EIO; } - READ32(nfserr); + nfserr = be32_to_cpup(p); if (nfserr != NFS_OK) return nfs4_stat_to_errno(nfserr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* Dummy routine */ @@ -2557,8 +2504,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) unsigned int strlen; char *str; - READ_BUF(12); - return decode_opaque_inline(xdr, &strlen, &str); + p = xdr_inline_decode(xdr, 12); + if (likely(p)) + return decode_opaque_inline(xdr, &strlen, &str); + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) @@ -2566,27 +2516,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) uint32_t bmlen; __be32 *p; - READ_BUF(4); - READ32(bmlen); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); bitmap[0] = bitmap[1] = 0; - READ_BUF((bmlen << 2)); + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; if (bmlen > 0) { - READ32(bitmap[0]); + bitmap[0] = be32_to_cpup(p++); if (bmlen > 1) - READ32(bitmap[1]); + bitmap[1] = be32_to_cpup(p); } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) { __be32 *p; - READ_BUF(4); - READ32(*attrlen); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *attrlen = be32_to_cpup(p); *savep = xdr->p; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) @@ -2609,8 +2571,10 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { - READ_BUF(4); - READ32(*type); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *type = be32_to_cpup(p); if (*type < NF4REG || *type > NF4NAMEDATTR) { dprintk("%s: bad type %d\n", __func__, *type); return -EIO; @@ -2620,6 +2584,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * } dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) @@ -2631,14 +2598,19 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { - READ_BUF(8); - READ64(*change); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; ret = NFS_ATTR_FATTR_CHANGE; } dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) @@ -2650,13 +2622,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { - READ_BUF(8); - READ64(*size); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, size); bitmap[0] &= ~FATTR4_WORD0_SIZE; ret = NFS_ATTR_FATTR_SIZE; } dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2667,12 +2644,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2683,12 +2665,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) @@ -2701,9 +2688,11 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { - READ_BUF(16); - READ64(fsid->major); - READ64(fsid->minor); + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &fsid->major); + xdr_decode_hyper(p, &fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; ret = NFS_ATTR_FATTR_FSID; } @@ -2711,6 +2700,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs (unsigned long long)fsid->major, (unsigned long long)fsid->minor); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2721,12 +2713,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2737,12 +2734,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; } dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -2754,13 +2756,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { - READ_BUF(8); - READ64(*fileid); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -2772,13 +2779,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { - READ_BUF(8); - READ64(*fileid); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2790,12 +2802,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; } dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2807,12 +2824,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; } dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2824,12 +2846,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; } dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) @@ -2838,8 +2865,10 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) __be32 *p; int status = 0; - READ_BUF(4); - READ32(n); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); if (n == 0) goto root_path; dprintk("path "); @@ -2873,6 +2902,9 @@ out_eio: dprintk(" status %d", status); status = -EIO; goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) @@ -2890,8 +2922,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st status = decode_pathname(xdr, &res->fs_path); if (unlikely(status != 0)) goto out; - READ_BUF(4); - READ32(n); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); if (n <= 0) goto out_eio; res->nlocations = 0; @@ -2899,8 +2933,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st u32 m; struct nfs4_fs_location *loc = &res->locations[res->nlocations]; - READ_BUF(4); - READ32(m); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + m = be32_to_cpup(p); loc->nservers = 0; dprintk("%s: servers ", __func__); @@ -2939,6 +2975,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; +out_overflow: + print_overflow_msg(__func__, xdr); out_eio: status = -EIO; goto out; @@ -2953,12 +2991,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; } dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) @@ -2970,12 +3013,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { - READ_BUF(4); - READ32(*maxlink); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxlink = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXLINK; } dprintk("%s: maxlink=%u\n", __func__, *maxlink); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) @@ -2987,12 +3035,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { - READ_BUF(4); - READ32(*maxname); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxname = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXNAME; } dprintk("%s: maxname=%u\n", __func__, *maxname); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3005,8 +3058,10 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { uint64_t maxread; - READ_BUF(8); - READ64(maxread); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxread); if (maxread > 0x7FFFFFFF) maxread = 0x7FFFFFFF; *res = (uint32_t)maxread; @@ -3014,6 +3069,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ } dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3026,8 +3084,10 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { uint64_t maxwrite; - READ_BUF(8); - READ64(maxwrite); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxwrite); if (maxwrite > 0x7FFFFFFF) maxwrite = 0x7FFFFFFF; *res = (uint32_t)maxwrite; @@ -3035,6 +3095,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 } dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) @@ -3047,14 +3110,19 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { - READ_BUF(4); - READ32(tmp); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + tmp = be32_to_cpup(p); *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; ret = NFS_ATTR_FATTR_MODE; } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) @@ -3066,16 +3134,22 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { - READ_BUF(4); - READ32(*nlink); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *nlink = be32_to_cpup(p); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; ret = NFS_ATTR_FATTR_NLINK; } dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) +static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_client *clp, uint32_t *uid, int may_sleep) { uint32_t len; __be32 *p; @@ -3085,10 +3159,16 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { - READ_BUF(4); - READ32(len); - READ_BUF(len); - if (len < XDR_MAX_NETOBJ) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (!may_sleep) { + /* do nothing */ + } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) ret = NFS_ATTR_FATTR_OWNER; else @@ -3101,9 +3181,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf } dprintk("%s: uid=%d\n", __func__, (int)*uid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) +static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_client *clp, uint32_t *gid, int may_sleep) { uint32_t len; __be32 *p; @@ -3113,10 +3197,16 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { - READ_BUF(4); - READ32(len); - READ_BUF(len); - if (len < XDR_MAX_NETOBJ) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (!may_sleep) { + /* do nothing */ + } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) ret = NFS_ATTR_FATTR_GROUP; else @@ -3129,6 +3219,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf } dprintk("%s: gid=%d\n", __func__, (int)*gid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) @@ -3143,9 +3236,11 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) { dev_t tmp; - READ_BUF(8); - READ32(major); - READ32(minor); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + major = be32_to_cpup(p++); + minor = be32_to_cpup(p); tmp = MKDEV(major, minor); if (MAJOR(tmp) == major && MINOR(tmp) == minor) *rdev = tmp; @@ -3154,6 +3249,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde } dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3165,12 +3263,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; } dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3182,12 +3285,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; } dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3199,12 +3307,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; } dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) @@ -3216,14 +3329,19 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { - READ_BUF(8); - READ64(*used); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; ret = NFS_ATTR_FATTR_SPACE_USED; } dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) @@ -3232,12 +3350,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) uint64_t sec; uint32_t nsec; - READ_BUF(12); - READ64(sec); - READ32(nsec); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &sec); + nsec = be32_to_cpup(p); time->tv_sec = (time_t)sec; time->tv_nsec = (long)nsec; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) @@ -3315,11 +3438,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c { __be32 *p; - READ_BUF(20); - READ32(cinfo->atomic); - READ64(cinfo->before); - READ64(cinfo->after); + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + cinfo->atomic = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &cinfo->before); + xdr_decode_hyper(p, &cinfo->after); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) @@ -3331,40 +3459,62 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) status = decode_op_hdr(xdr, OP_ACCESS); if (status) return status; - READ_BUF(8); - READ32(supp); - READ32(acc); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + supp = be32_to_cpup(p++); + acc = be32_to_cpup(p); access->supported = supp; access->access = acc; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) { __be32 *p; + + p = xdr_inline_decode(xdr, len); + if (likely(p)) { + memcpy(buf, p, len); + return 0; + } + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); +} + +static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +{ int status; status = decode_op_hdr(xdr, OP_CLOSE); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_verifier(struct xdr_stream *xdr, void *verifier) +{ + return decode_opaque_fixed(xdr, verifier, 8); } static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_COMMIT); - if (status) - return status; - READ_BUF(8); - COPYMEM(res->verf->verifier, 8); - return 0; + if (!status) + status = decode_verifier(xdr, res->verf->verifier); + return status; } static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -3378,10 +3528,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) return status; if ((status = decode_change_info(xdr, cinfo))) return status; - READ_BUF(4); - READ32(bmlen); - READ_BUF(bmlen << 2); - return 0; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) @@ -3466,7 +3622,8 @@ xdr_error: return status; } -static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server) +static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + const struct nfs_server *server, int may_sleep) { __be32 *savep; uint32_t attrlen, @@ -3538,12 +3695,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons goto xdr_error; fattr->valid |= status; - status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid); + status = decode_attr_owner(xdr, bitmap, server->nfs_client, + &fattr->uid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; - status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid); + status = decode_attr_group(xdr, bitmap, server->nfs_client, + &fattr->gid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; @@ -3633,14 +3792,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) if (status) return status; - READ_BUF(4); - READ32(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); if (len > NFS4_FHSIZE) return -EIO; fh->size = len; - READ_BUF(len); - COPYMEM(fh->data, len); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + memcpy(fh->data, p, len); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -3662,10 +3828,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) __be32 *p; uint32_t namelen, type; - READ_BUF(32); - READ64(offset); - READ64(length); - READ32(type); + p = xdr_inline_decode(xdr, 32); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &offset); + p = xdr_decode_hyper(p, &length); + type = be32_to_cpup(p++); if (fl != NULL) { fl->fl_start = (loff_t)offset; fl->fl_end = fl->fl_start + (loff_t)length - 1; @@ -3676,23 +3844,27 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_type = F_RDLCK; fl->fl_pid = 0; } - READ64(clientid); - READ32(namelen); - READ_BUF(namelen); - return -NFS4ERR_DENIED; + p = xdr_decode_hyper(p, &clientid); + namelen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, namelen); + if (likely(p)) + return -NFS4ERR_DENIED; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_LOCK); if (status == -EIO) goto out; if (status == 0) { - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) + goto out; } else if (status == -NFS4ERR_DENIED) status = decode_lock_denied(xdr, NULL); if (res->open_seqid != NULL) @@ -3713,16 +3885,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res) static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_LOCKU); if (status != -EIO) nfs_increment_lock_seqid(status, res->seqid); - if (status == 0) { - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - } + if (status == 0) + status = decode_stateid(xdr, &res->stateid); return status; } @@ -3737,34 +3906,46 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) __be32 *p; uint32_t limit_type, nblocks, blocksize; - READ_BUF(12); - READ32(limit_type); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + limit_type = be32_to_cpup(p++); switch (limit_type) { case 1: - READ64(*maxsize); + xdr_decode_hyper(p, maxsize); break; case 2: - READ32(nblocks); - READ32(blocksize); + nblocks = be32_to_cpup(p++); + blocksize = be32_to_cpup(p); *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) { __be32 *p; uint32_t delegation_type; + int status; - READ_BUF(4); - READ32(delegation_type); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + delegation_type = be32_to_cpup(p); if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { res->delegation_type = 0; return 0; } - READ_BUF(NFS4_STATEID_SIZE+4); - COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); - READ32(res->do_recall); + status = decode_stateid(xdr, &res->delegation); + if (unlikely(status)) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->do_recall = be32_to_cpup(p); switch (delegation_type) { case NFS4_OPEN_DELEGATE_READ: @@ -3776,6 +3957,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) return -EIO; } return decode_ace(xdr, NULL, res->server->nfs_client); +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) @@ -3787,23 +3971,27 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) status = decode_op_hdr(xdr, OP_OPEN); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) + if (!status) + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); decode_change_info(xdr, &res->cinfo); - READ_BUF(8); - READ32(res->rflags); - READ32(bmlen); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + res->rflags = be32_to_cpup(p++); + bmlen = be32_to_cpup(p); if (bmlen > 10) goto xdr_error; - READ_BUF(bmlen << 2); + p = xdr_inline_decode(xdr, bmlen << 2); + if (unlikely(!p)) + goto out_overflow; savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); for (i = 0; i < savewords; ++i) - READ32(res->attrset[i]); + res->attrset[i] = be32_to_cpup(p++); for (; i < NFS4_BITMAP_SIZE; i++) res->attrset[i] = 0; @@ -3811,36 +3999,33 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) xdr_error: dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } static int decode_putfh(struct xdr_stream *xdr) @@ -3863,9 +4048,11 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ status = decode_op_hdr(xdr, OP_READ); if (status) return status; - READ_BUF(8); - READ32(eof); - READ32(count); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + eof = be32_to_cpup(p++); + count = be32_to_cpup(p); hdrlen = (u8 *) p - (u8 *) iov->iov_base; recvd = req->rq_rcv_buf.len - hdrlen; if (count > recvd) { @@ -3878,6 +4065,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ res->eof = eof; res->count = count; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) @@ -3892,17 +4082,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n int status; status = decode_op_hdr(xdr, OP_READDIR); - if (status) + if (!status) + status = decode_verifier(xdr, readdir->verifier.data); + if (unlikely(status)) return status; - READ_BUF(8); - COPYMEM(readdir->verifier.data, 8); dprintk("%s: verifier = %08x:%08x\n", __func__, ((u32 *)readdir->verifier.data)[0], ((u32 *)readdir->verifier.data)[1]); - hdrlen = (char *) p - (char *) iov->iov_base; + hdrlen = (char *) xdr->p - (char *) iov->iov_base; recvd = rcvbuf->len - hdrlen; if (pglen > recvd) pglen = recvd; @@ -3990,8 +4180,10 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) return status; /* Convert length of symlink */ - READ_BUF(4); - READ32(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); if (len >= rcvbuf->page_len || len <= 0) { dprintk("nfs: server returned giant symlink!\n"); return -ENAMETOOLONG; @@ -4015,6 +4207,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) kaddr[len+rcvbuf->page_base] = '\0'; kunmap_atomic(kaddr, KM_USER0); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -4112,10 +4307,16 @@ static int decode_setattr(struct xdr_stream *xdr) status = decode_op_hdr(xdr, OP_SETATTR); if (status) return status; - READ_BUF(4); - READ32(bmlen); - READ_BUF(bmlen << 2); - return 0; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) @@ -4124,35 +4325,50 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) uint32_t opnum; int32_t nfserr; - READ_BUF(8); - READ32(opnum); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); if (opnum != OP_SETCLIENTID) { dprintk("nfs: decode_setclientid: Server returned operation" " %d\n", opnum); return -EIO; } - READ32(nfserr); + nfserr = be32_to_cpup(p); if (nfserr == NFS_OK) { - READ_BUF(8 + NFS4_VERIFIER_SIZE); - READ64(clp->cl_clientid); - COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE); + p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &clp->cl_clientid); + memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); } else if (nfserr == NFSERR_CLID_INUSE) { uint32_t len; /* skip netid string */ - READ_BUF(4); - READ32(len); - READ_BUF(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; /* skip uaddr string */ - READ_BUF(4); - READ32(len); - READ_BUF(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; return -NFSERR_CLID_INUSE; } else return nfs4_stat_to_errno(nfserr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_setclientid_confirm(struct xdr_stream *xdr) @@ -4169,11 +4385,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) if (status) return status; - READ_BUF(16); - READ32(res->count); - READ32(res->verf->committed); - COPYMEM(res->verf->verifier, 8); + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + res->count = be32_to_cpup(p++); + res->verf->committed = be32_to_cpup(p++); + memcpy(res->verf->verifier, p, 8); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_delegreturn(struct xdr_stream *xdr) @@ -4187,6 +4408,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, { __be32 *p; uint32_t dummy; + char *dummy_str; int status; struct nfs_client *clp = res->client; @@ -4194,36 +4416,45 @@ static int decode_exchange_id(struct xdr_stream *xdr, if (status) return status; - READ_BUF(8); - READ64(clp->cl_ex_clid); - READ_BUF(12); - READ32(clp->cl_seqid); - READ32(clp->cl_exchange_flags); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &clp->cl_ex_clid); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + clp->cl_seqid = be32_to_cpup(p++); + clp->cl_exchange_flags = be32_to_cpup(p++); /* We ask for SP4_NONE */ - READ32(dummy); + dummy = be32_to_cpup(p); if (dummy != SP4_NONE) return -EIO; /* Throw away minor_id */ - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; /* Throw away Major id */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; /* Throw away server_scope */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; /* Throw away Implementation id array */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_chan_attrs(struct xdr_stream *xdr, @@ -4232,22 +4463,35 @@ static int decode_chan_attrs(struct xdr_stream *xdr, __be32 *p; u32 nr_attrs; - READ_BUF(28); - READ32(attrs->headerpadsz); - READ32(attrs->max_rqst_sz); - READ32(attrs->max_resp_sz); - READ32(attrs->max_resp_sz_cached); - READ32(attrs->max_ops); - READ32(attrs->max_reqs); - READ32(nr_attrs); + p = xdr_inline_decode(xdr, 28); + if (unlikely(!p)) + goto out_overflow; + attrs->headerpadsz = be32_to_cpup(p++); + attrs->max_rqst_sz = be32_to_cpup(p++); + attrs->max_resp_sz = be32_to_cpup(p++); + attrs->max_resp_sz_cached = be32_to_cpup(p++); + attrs->max_ops = be32_to_cpup(p++); + attrs->max_reqs = be32_to_cpup(p++); + nr_attrs = be32_to_cpup(p); if (unlikely(nr_attrs > 1)) { printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", __func__, nr_attrs); return -EINVAL; } - if (nr_attrs == 1) - READ_BUF(4); /* skip rdma_attrs */ + if (nr_attrs == 1) { + p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ + if (unlikely(!p)) + goto out_overflow; + } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) +{ + return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN); } static int decode_create_session(struct xdr_stream *xdr, @@ -4259,24 +4503,26 @@ static int decode_create_session(struct xdr_stream *xdr, struct nfs4_session *session = clp->cl_session; status = decode_op_hdr(xdr, OP_CREATE_SESSION); - - if (status) + if (!status) + status = decode_sessionid(xdr, &session->sess_id); + if (unlikely(status)) return status; - /* sessionid */ - READ_BUF(NFS4_MAX_SESSIONID_LEN); - COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN); - /* seqid, flags */ - READ_BUF(8); - READ32(clp->cl_seqid); - READ32(session->flags); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + clp->cl_seqid = be32_to_cpup(p++); + session->flags = be32_to_cpup(p); /* Channel attributes */ status = decode_chan_attrs(xdr, &session->fc_attrs); if (!status) status = decode_chan_attrs(xdr, &session->bc_attrs); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) @@ -4300,7 +4546,9 @@ static int decode_sequence(struct xdr_stream *xdr, return 0; status = decode_op_hdr(xdr, OP_SEQUENCE); - if (status) + if (!status) + status = decode_sessionid(xdr, &id); + if (unlikely(status)) goto out_err; /* @@ -4309,36 +4557,43 @@ static int decode_sequence(struct xdr_stream *xdr, */ status = -ESERVERFAULT; - slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; - READ_BUF(NFS4_MAX_SESSIONID_LEN + 20); - COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN); if (memcmp(id.data, res->sr_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("%s Invalid session id\n", __func__); goto out_err; } + + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + /* seqid */ - READ32(dummy); + slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; + dummy = be32_to_cpup(p++); if (dummy != slot->seq_nr) { dprintk("%s Invalid sequence number\n", __func__); goto out_err; } /* slot id */ - READ32(dummy); + dummy = be32_to_cpup(p++); if (dummy != res->sr_slotid) { dprintk("%s Invalid slot id\n", __func__); goto out_err; } /* highest slot id - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* target highest slot id - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* result flags - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p); status = 0; out_err: res->sr_status = status; return status; +out_overflow: + print_overflow_msg(__func__, xdr); + status = -EIO; + goto out_err; #else /* CONFIG_NFS_V4_1 */ return 0; #endif /* CONFIG_NFS_V4_1 */ @@ -4370,7 +4625,8 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct status = decode_open_downgrade(&xdr, res); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4397,7 +4653,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac status = decode_access(&xdr, res); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4424,7 +4681,8 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo goto out; if ((status = decode_getfh(&xdr, res->fh)) != 0) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server + ,!RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4448,7 +4706,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf if ((status = decode_putrootfh(&xdr)) != 0) goto out; if ((status = decode_getfh(&xdr, res->fh)) == 0) - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4473,7 +4732,8 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem goto out; if ((status = decode_remove(&xdr, &res->cinfo)) != 0) goto out; - decode_getfattr(&xdr, &res->dir_attr, res->server); + decode_getfattr(&xdr, &res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4503,11 +4763,13 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0) goto out; /* Current FH is target directory */ - if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0) + if (decode_getfattr(&xdr, res->new_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if ((status = decode_restorefh(&xdr)) != 0) goto out; - decode_getfattr(&xdr, res->old_fattr, res->server); + decode_getfattr(&xdr, res->old_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4540,11 +4802,13 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link * Note order: OP_LINK leaves the directory as the current * filehandle. */ - if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0) + if (decode_getfattr(&xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if ((status = decode_restorefh(&xdr)) != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4573,11 +4837,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr goto out; if ((status = decode_getfh(&xdr, res->fh)) != 0) goto out; - if (decode_getfattr(&xdr, res->fattr, res->server) != 0) + if (decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if ((status = decode_restorefh(&xdr)) != 0) goto out; - decode_getfattr(&xdr, res->dir_fattr, res->server); + decode_getfattr(&xdr, res->dir_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4609,7 +4875,8 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g status = decode_putfh(&xdr); if (status) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4716,7 +4983,8 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos * an ESTALE error. Shouldn't be a problem, * though, since fattr->valid will remain unset. */ - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4748,11 +5016,13 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr goto out; if (decode_getfh(&xdr, &res->fh) != 0) goto out; - if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) + if (decode_getfattr(&xdr, res->f_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if (decode_restorefh(&xdr) != 0) goto out; - decode_getfattr(&xdr, res->dir_attr, res->server); + decode_getfattr(&xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4800,7 +5070,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf status = decode_open(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->f_attr, res->server); + decode_getfattr(&xdr, res->f_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4827,7 +5098,8 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se status = decode_setattr(&xdr); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5001,7 +5273,8 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ status = decode_write(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); if (!status) status = res->count; out: @@ -5030,7 +5303,8 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri status = decode_commit(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5194,7 +5468,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf if (status != 0) goto out; status = decode_delegreturn(&xdr); - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5222,7 +5497,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, goto out; xdr_enter_page(&xdr, PAGE_SIZE); status = decode_getfattr(&xdr, &res->fs_locations->fattr, - res->fs_locations->server); + res->fs_locations->server, + !RPC_IS_ASYNC(req->rq_task)); out: return status; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 96c4ebfa46f4..12c9e66d3f1d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -18,7 +18,6 @@ #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> -#include <linux/smp_lock.h> #include <asm/system.h> @@ -61,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) return p; } -static void nfs_readdata_free(struct nfs_read_data *p) +void nfs_readdata_free(struct nfs_read_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_rdata_mempool); } -void nfs_readdata_release(void *data) +static void nfs_readdata_release(struct nfs_read_data *rdata) { - struct nfs_read_data *rdata = data; - put_nfs_open_context(rdata->args.context); nfs_readdata_free(rdata); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0b4cbdc60abd..de935692d40d 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -73,7 +73,7 @@ enum { Opt_cto, Opt_nocto, Opt_ac, Opt_noac, Opt_lock, Opt_nolock, - Opt_v2, Opt_v3, + Opt_v2, Opt_v3, Opt_v4, Opt_udp, Opt_tcp, Opt_rdma, Opt_acl, Opt_noacl, Opt_rdirplus, Opt_nordirplus, @@ -127,6 +127,7 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_nolock, "nolock" }, { Opt_v2, "v2" }, { Opt_v3, "v3" }, + { Opt_v4, "v4" }, { Opt_udp, "udp" }, { Opt_tcp, "tcp" }, { Opt_rdma, "rdma" }, @@ -158,7 +159,7 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_mountvers, "mountvers=%s" }, { Opt_nfsvers, "nfsvers=%s" }, { Opt_nfsvers, "vers=%s" }, - { Opt_minorversion, "minorversion=%u" }, + { Opt_minorversion, "minorversion=%s" }, { Opt_sec, "sec=%s" }, { Opt_proto, "proto=%s" }, @@ -272,6 +273,10 @@ static const struct super_operations nfs_sops = { }; #ifdef CONFIG_NFS_V4 +static int nfs4_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, const char *dev_name); +static int nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data, struct vfsmount *mnt); static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static int nfs4_remote_get_sb(struct file_system_type *fs_type, @@ -742,127 +747,23 @@ static int nfs_verify_server_address(struct sockaddr *addr) } } + dfprintk(MOUNT, "NFS: Invalid IP address specified\n"); return 0; } -static void nfs_parse_ipv4_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) -{ - struct sockaddr_in *sin = (struct sockaddr_in *)sap; - u8 *addr = (u8 *)&sin->sin_addr.s_addr; - - if (str_len <= INET_ADDRSTRLEN) { - dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n", - (int)str_len, string); - - sin->sin_family = AF_INET; - *addr_len = sizeof(*sin); - if (in4_pton(string, str_len, addr, '\0', NULL)) - return; - } - - sap->sa_family = AF_UNSPEC; - *addr_len = 0; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, - const char *delim, - struct sockaddr_in6 *sin6) -{ - char *p; - size_t len; - - if ((string + str_len) == delim) - return 1; - - if (*delim != IPV6_SCOPE_DELIMITER) - return 0; - - if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) - return 0; - - len = (string + str_len) - delim - 1; - p = kstrndup(delim + 1, len, GFP_KERNEL); - if (p) { - unsigned long scope_id = 0; - struct net_device *dev; - - dev = dev_get_by_name(&init_net, p); - if (dev != NULL) { - scope_id = dev->ifindex; - dev_put(dev); - } else { - if (strict_strtoul(p, 10, &scope_id) == 0) { - kfree(p); - return 0; - } - } - - kfree(p); - - sin6->sin6_scope_id = scope_id; - dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); - return 1; - } - - return 0; -} - -static void nfs_parse_ipv6_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) -{ - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - u8 *addr = (u8 *)&sin6->sin6_addr.in6_u; - const char *delim; - - if (str_len <= INET6_ADDRSTRLEN) { - dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n", - (int)str_len, string); - - sin6->sin6_family = AF_INET6; - *addr_len = sizeof(*sin6); - if (in6_pton(string, str_len, addr, - IPV6_SCOPE_DELIMITER, &delim) != 0) { - if (nfs_parse_ipv6_scope_id(string, str_len, - delim, sin6) != 0) - return; - } - } - - sap->sa_family = AF_UNSPEC; - *addr_len = 0; -} -#else -static void nfs_parse_ipv6_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) -{ - sap->sa_family = AF_UNSPEC; - *addr_len = 0; -} -#endif - /* - * Construct a sockaddr based on the contents of a string that contains - * an IP address in presentation format. - * - * If there is a problem constructing the new sockaddr, set the address - * family to AF_UNSPEC. + * Select between a default port value and a user-specified port value. + * If a zero value is set, then autobind will be used. */ -void nfs_parse_ip_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) +static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port, + const unsigned short default_port) { - unsigned int i, colons; + unsigned short port = default_port; - colons = 0; - for (i = 0; i < str_len; i++) - if (string[i] == ':') - colons++; + if (parsed_port != NFS_UNSPEC_PORT) + port = parsed_port; - if (colons >= 2) - nfs_parse_ipv6_address(string, str_len, sap, addr_len); - else - nfs_parse_ipv4_address(string, str_len, sap, addr_len); + rpc_set_port(sap, port); } /* @@ -904,8 +805,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) /* * Parse the value of the 'sec=' option. - * - * The flavor_len setting is for v4 mounts. */ static int nfs_parse_security_flavors(char *value, struct nfs_parsed_mount_data *mnt) @@ -916,53 +815,43 @@ static int nfs_parse_security_flavors(char *value, switch (match_token(value, nfs_secflavor_tokens, args)) { case Opt_sec_none: - mnt->auth_flavor_len = 0; mnt->auth_flavors[0] = RPC_AUTH_NULL; break; case Opt_sec_sys: - mnt->auth_flavor_len = 0; mnt->auth_flavors[0] = RPC_AUTH_UNIX; break; case Opt_sec_krb5: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; break; case Opt_sec_krb5i: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; break; case Opt_sec_krb5p: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; break; case Opt_sec_lkey: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; break; case Opt_sec_lkeyi: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; break; case Opt_sec_lkeyp: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; break; case Opt_sec_spkm: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; break; case Opt_sec_spkmi: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; break; case Opt_sec_spkmp: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; break; default: return 0; } + mnt->auth_flavor_len = 1; return 1; } @@ -1001,7 +890,6 @@ static int nfs_parse_mount_options(char *raw, while ((p = strsep(&raw, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; unsigned long option; - int int_option; int token; if (!*p) @@ -1047,10 +935,18 @@ static int nfs_parse_mount_options(char *raw, break; case Opt_v2: mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 2; break; case Opt_v3: mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; break; +#ifdef CONFIG_NFS_V4 + case Opt_v4: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 4; + break; +#endif case Opt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; @@ -1264,20 +1160,33 @@ static int nfs_parse_mount_options(char *raw, switch (option) { case NFS2_VERSION: mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 2; break; case NFS3_VERSION: mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; break; +#ifdef CONFIG_NFS_V4 + case NFS4_VERSION: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 4; + break; +#endif default: goto out_invalid_value; } break; case Opt_minorversion: - if (match_int(args, &int_option)) - return 0; - if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION) - return 0; - mnt->minorversion = int_option; + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + rc = strict_strtoul(string, 10, &option); + kfree(string); + if (rc != 0) + goto out_invalid_value; + if (option > NFS4_MAX_MINOR_VERSION) + goto out_invalid_value; + mnt->minorversion = option; break; /* @@ -1352,11 +1261,14 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_ip_address(string, strlen(string), - (struct sockaddr *) - &mnt->nfs_server.address, - &mnt->nfs_server.addrlen); + mnt->nfs_server.addrlen = + rpc_pton(string, strlen(string), + (struct sockaddr *) + &mnt->nfs_server.address, + sizeof(mnt->nfs_server.address)); kfree(string); + if (mnt->nfs_server.addrlen == 0) + goto out_invalid_address; break; case Opt_clientaddr: string = match_strdup(args); @@ -1376,11 +1288,14 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_ip_address(string, strlen(string), - (struct sockaddr *) - &mnt->mount_server.address, - &mnt->mount_server.addrlen); + mnt->mount_server.addrlen = + rpc_pton(string, strlen(string), + (struct sockaddr *) + &mnt->mount_server.address, + sizeof(mnt->mount_server.address)); kfree(string); + if (mnt->mount_server.addrlen == 0) + goto out_invalid_address; break; case Opt_lookupcache: string = match_strdup(args); @@ -1432,8 +1347,11 @@ static int nfs_parse_mount_options(char *raw, return 1; +out_invalid_address: + printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); + return 0; out_invalid_value: - printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p); + printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p); return 0; out_nomem: printk(KERN_INFO "NFS: not enough memory to parse option\n"); @@ -1445,13 +1363,60 @@ out_security_failure: } /* + * Match the requested auth flavors with the list returned by + * the server. Returns zero and sets the mount's authentication + * flavor on success; returns -EACCES if server does not support + * the requested flavor. + */ +static int nfs_walk_authlist(struct nfs_parsed_mount_data *args, + struct nfs_mount_request *request) +{ + unsigned int i, j, server_authlist_len = *(request->auth_flav_len); + + /* + * Certain releases of Linux's mountd return an empty + * flavor list. To prevent behavioral regression with + * these servers (ie. rejecting mounts that used to + * succeed), revert to pre-2.6.32 behavior (no checking) + * if the returned flavor list is empty. + */ + if (server_authlist_len == 0) + return 0; + + /* + * We avoid sophisticated negotiating here, as there are + * plenty of cases where we can get it wrong, providing + * either too little or too much security. + * + * RFC 2623, section 2.7 suggests we SHOULD prefer the + * flavor listed first. However, some servers list + * AUTH_NULL first. Our caller plants AUTH_SYS, the + * preferred default, in args->auth_flavors[0] if user + * didn't specify sec= mount option. + */ + for (i = 0; i < args->auth_flavor_len; i++) + for (j = 0; j < server_authlist_len; j++) + if (args->auth_flavors[i] == request->auth_flavs[j]) { + dfprintk(MOUNT, "NFS: using auth flavor %d\n", + request->auth_flavs[j]); + args->auth_flavors[0] = request->auth_flavs[j]; + return 0; + } + + dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n"); + nfs_umount(request); + return -EACCES; +} + +/* * Use the remote server's MOUNT service to request the NFS file handle * corresponding to the provided path. */ static int nfs_try_mount(struct nfs_parsed_mount_data *args, struct nfs_fh *root_fh) { - unsigned int auth_flavor_len = 0; + rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS]; + unsigned int server_authlist_len = ARRAY_SIZE(server_authlist); struct nfs_mount_request request = { .sap = (struct sockaddr *) &args->mount_server.address, @@ -1459,7 +1424,8 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, .protocol = args->mount_server.protocol, .fh = root_fh, .noresvport = args->flags & NFS_MOUNT_NORESVPORT, - .auth_flav_len = &auth_flavor_len, + .auth_flav_len = &server_authlist_len, + .auth_flavs = server_authlist, }; int status; @@ -1485,23 +1451,25 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, args->mount_server.addrlen = args->nfs_server.addrlen; } request.salen = args->mount_server.addrlen; - - /* - * autobind will be used if mount_server.port == 0 - */ - nfs_set_port(request.sap, args->mount_server.port); + nfs_set_default_port(request.sap, args->mount_server.port, 0); /* * Now ask the mount server to map our export path * to a file handle. */ status = nfs_mount(&request); - if (status == 0) - return 0; + if (status != 0) { + dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", + request.hostname, status); + return status; + } - dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", - request.hostname, status); - return status; + /* + * MNTv1 (NFSv2) does not support auth flavor negotiation. + */ + if (args->mount_server.version != NFS_MNT3_VERSION) + return 0; + return nfs_walk_authlist(args, &request); } static int nfs_parse_simple_hostname(const char *dev_name, @@ -1661,6 +1629,7 @@ static int nfs_validate_mount_data(void *options, const char *dev_name) { struct nfs_mount_data *data = (struct nfs_mount_data *)options; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; if (data == NULL) goto out_no_data; @@ -1672,10 +1641,12 @@ static int nfs_validate_mount_data(void *options, args->acregmax = NFS_DEF_ACREGMAX; args->acdirmin = NFS_DEF_ACDIRMIN; args->acdirmax = NFS_DEF_ACDIRMAX; - args->mount_server.port = 0; /* autobind unless user sets port */ - args->nfs_server.port = 0; /* autobind unless user sets port */ + args->mount_server.port = NFS_UNSPEC_PORT; + args->nfs_server.port = NFS_UNSPEC_PORT; args->nfs_server.protocol = XPRT_TRANSPORT_TCP; args->auth_flavors[0] = RPC_AUTH_UNIX; + args->auth_flavor_len = 1; + args->minorversion = 0; switch (data->version) { case 1: @@ -1697,8 +1668,11 @@ static int nfs_validate_mount_data(void *options, if (data->root.size > NFS3_FHSIZE || data->root.size == 0) goto out_invalid_fh; mntfh->size = data->root.size; - } else + args->version = 3; + } else { mntfh->size = NFS2_FHSIZE; + args->version = 2; + } memcpy(mntfh->data, data->root.data, mntfh->size); @@ -1720,11 +1694,9 @@ static int nfs_validate_mount_data(void *options, args->acdirmin = data->acdirmin; args->acdirmax = data->acdirmax; - memcpy(&args->nfs_server.address, &data->addr, - sizeof(data->addr)); + memcpy(sap, &data->addr, sizeof(data->addr)); args->nfs_server.addrlen = sizeof(data->addr); - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) goto out_no_address; if (!(data->flags & NFS_MOUNT_TCP)) @@ -1772,12 +1744,18 @@ static int nfs_validate_mount_data(void *options, if (nfs_parse_mount_options((char *)options, args) == 0) return -EINVAL; - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) goto out_no_address; - nfs_set_port((struct sockaddr *)&args->nfs_server.address, - args->nfs_server.port); + if (args->version == 4) +#ifdef CONFIG_NFS_V4 + return nfs4_validate_text_mount_data(options, + args, dev_name); +#else + goto out_v4_not_compiled; +#endif + + nfs_set_default_port(sap, args->nfs_server.port, 0); nfs_set_mount_transport_protocol(args); @@ -1825,6 +1803,12 @@ out_v3_not_compiled: return -EPROTONOSUPPORT; #endif /* !CONFIG_NFS_V3 */ +#ifndef CONFIG_NFS_V4 +out_v4_not_compiled: + dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); + return -EPROTONOSUPPORT; +#endif /* !CONFIG_NFS_V4 */ + out_nomem: dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); return -ENOMEM; @@ -1934,6 +1918,8 @@ static inline void nfs_initialise_sb(struct super_block *sb) if (server->flags & NFS_MOUNT_NOAC) sb->s_flags |= MS_SYNCHRONOUS; + sb->s_bdi = &server->backing_dev_info; + nfs_super_set_maxbytes(sb, server->maxfilesize); } @@ -2120,6 +2106,14 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (error < 0) goto out; +#ifdef CONFIG_NFS_V4 + if (data->version == 4) { + error = nfs4_try_mount(flags, dev_name, data, mnt); + kfree(data->client_address); + goto out; + } +#endif /* CONFIG_NFS_V4 */ + /* Get a volume representation */ server = nfs_create_server(data, mntfh); if (IS_ERR(server)) { @@ -2317,6 +2311,43 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3); } +static int nfs4_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, + const char *dev_name) +{ + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + + nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT); + + nfs_validate_transport_protocol(args); + + nfs4_validate_mount_flags(args); + + if (args->version != 4) { + dfprintk(MOUNT, + "NFS4: Illegal mount version\n"); + return -EINVAL; + } + + if (args->auth_flavor_len > 1) { + dfprintk(MOUNT, + "NFS4: Too many RPC auth flavours specified\n"); + return -EINVAL; + } + + if (args->client_address == NULL) { + dfprintk(MOUNT, + "NFS4: mount program didn't pass callback address\n"); + return -EINVAL; + } + + return nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + NFS4_MAXNAMLEN, + &args->nfs_server.export_path, + NFS4_MAXPATHLEN); +} + /* * Validate NFSv4 mount options */ @@ -2324,7 +2355,7 @@ static int nfs4_validate_mount_data(void *options, struct nfs_parsed_mount_data *args, const char *dev_name) { - struct sockaddr_in *ap; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; char *c; @@ -2337,23 +2368,22 @@ static int nfs4_validate_mount_data(void *options, args->acregmax = NFS_DEF_ACREGMAX; args->acdirmin = NFS_DEF_ACDIRMIN; args->acdirmax = NFS_DEF_ACDIRMAX; - args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ + args->nfs_server.port = NFS_UNSPEC_PORT; args->auth_flavors[0] = RPC_AUTH_UNIX; - args->auth_flavor_len = 0; + args->auth_flavor_len = 1; + args->version = 4; args->minorversion = 0; switch (data->version) { case 1: - ap = (struct sockaddr_in *)&args->nfs_server.address; if (data->host_addrlen > sizeof(args->nfs_server.address)) goto out_no_address; if (data->host_addrlen == 0) goto out_no_address; args->nfs_server.addrlen = data->host_addrlen; - if (copy_from_user(ap, data->host_addr, data->host_addrlen)) + if (copy_from_user(sap, data->host_addr, data->host_addrlen)) return -EFAULT; - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) goto out_no_address; if (data->auth_flavourlen) { @@ -2399,39 +2429,14 @@ static int nfs4_validate_mount_data(void *options, nfs_validate_transport_protocol(args); break; - default: { - int status; - + default: if (nfs_parse_mount_options((char *)options, args) == 0) return -EINVAL; - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) return -EINVAL; - nfs_set_port((struct sockaddr *)&args->nfs_server.address, - args->nfs_server.port); - - nfs_validate_transport_protocol(args); - - nfs4_validate_mount_flags(args); - - if (args->auth_flavor_len > 1) - goto out_inval_auth; - - if (args->client_address == NULL) - goto out_no_client_address; - - status = nfs_parse_devname(dev_name, - &args->nfs_server.hostname, - NFS4_MAXNAMLEN, - &args->nfs_server.export_path, - NFS4_MAXPATHLEN); - if (status < 0) - return status; - - break; - } + return nfs4_validate_text_mount_data(options, args, dev_name); } return 0; @@ -2448,10 +2453,6 @@ out_inval_auth: out_no_address: dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); return -EINVAL; - -out_no_client_address: - dfprintk(MOUNT, "NFS4: mount program didn't pass callback address\n"); - return -EINVAL; } /* @@ -2618,6 +2619,34 @@ out_err: return ret; } +static int nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data, + struct vfsmount *mnt) +{ + char *export_path; + struct vfsmount *root_mnt; + int error; + + dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + + export_path = data->nfs_server.export_path; + data->nfs_server.export_path = "/"; + root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, + data->nfs_server.hostname); + data->nfs_server.export_path = export_path; + + error = PTR_ERR(root_mnt); + if (IS_ERR(root_mnt)) + goto out; + + error = nfs_follow_remote_path(root_mnt, export_path, mnt); + +out: + dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error, + error != 0 ? " [error]" : ""); + return error; +} + /* * Get the superblock for an NFS4 mountpoint */ @@ -2625,8 +2654,6 @@ static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { struct nfs_parsed_mount_data *data; - char *export_path; - struct vfsmount *root_mnt; int error = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); @@ -2638,17 +2665,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, if (error < 0) goto out; - export_path = data->nfs_server.export_path; - data->nfs_server.export_path = "/"; - root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, - data->nfs_server.hostname); - data->nfs_server.export_path = export_path; - - error = PTR_ERR(root_mnt); - if (IS_ERR(root_mnt)) - goto out; - - error = nfs_follow_remote_path(root_mnt, export_path, mnt); + error = nfs4_try_mount(flags, dev_name, data, mnt); out: kfree(data->client_address); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ce728829f79a..53eb26c16b50 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -13,6 +13,7 @@ #include <linux/file.h> #include <linux/writeback.h> #include <linux/swap.h> +#include <linux/migrate.h> #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> @@ -26,6 +27,7 @@ #include "internal.h" #include "iostat.h" #include "nfs4_fs.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -87,17 +89,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) return p; } -static void nfs_writedata_free(struct nfs_write_data *p) +void nfs_writedata_free(struct nfs_write_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_wdata_mempool); } -void nfs_writedata_release(void *data) +static void nfs_writedata_release(struct nfs_write_data *wdata) { - struct nfs_write_data *wdata = data; - put_nfs_open_context(wdata->args.context); nfs_writedata_free(wdata); } @@ -202,8 +202,10 @@ static int nfs_set_page_writeback(struct page *page) struct nfs_server *nfss = NFS_SERVER(inode); if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) - set_bdi_congested(&nfss->backing_dev_info, WRITE); + NFS_CONGESTION_ON_THRESH) { + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); + } } return ret; } @@ -215,27 +217,20 @@ static void nfs_end_page_writeback(struct page *page) end_page_writeback(page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, WRITE); + clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } -/* - * Find an associated nfs write request, and prepare to flush it out - * May return an error if the user signalled nfs_wait_on_request(). - */ -static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page) +static struct nfs_page *nfs_find_and_lock_request(struct page *page) { struct inode *inode = page->mapping->host; struct nfs_page *req; int ret; spin_lock(&inode->i_lock); - for(;;) { + for (;;) { req = nfs_page_find_request_locked(page); - if (req == NULL) { - spin_unlock(&inode->i_lock); - return 0; - } + if (req == NULL) + break; if (nfs_set_page_tag_locked(req)) break; /* Note: If we hold the page lock, as is the case in nfs_writepage, @@ -247,23 +242,40 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, ret = nfs_wait_on_request(req); nfs_release_request(req); if (ret != 0) - return ret; + return ERR_PTR(ret); spin_lock(&inode->i_lock); } - if (test_bit(PG_CLEAN, &req->wb_flags)) { - spin_unlock(&inode->i_lock); - BUG(); - } - if (nfs_set_page_writeback(page) != 0) { - spin_unlock(&inode->i_lock); - BUG(); - } spin_unlock(&inode->i_lock); + return req; +} + +/* + * Find an associated nfs write request, and prepare to flush it out + * May return an error if the user signalled nfs_wait_on_request(). + */ +static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, + struct page *page) +{ + struct nfs_page *req; + int ret = 0; + + req = nfs_find_and_lock_request(page); + if (!req) + goto out; + ret = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + ret = nfs_set_page_writeback(page); + BUG_ON(ret != 0); + BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); + if (!nfs_pageio_add_request(pgio, req)) { nfs_redirty_request(req); - return pgio->pg_error; + ret = pgio->pg_error; } - return 0; +out: + return ret; } static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) @@ -1478,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how) .nr_to_write = LONG_MAX, .range_start = 0, .range_end = LLONG_MAX, - .for_writepages = 1, }; return __nfs_write_mapping(mapping, &wbc, how); @@ -1580,6 +1591,41 @@ int nfs_wb_page(struct inode *inode, struct page* page) return nfs_wb_page_priority(inode, page, FLUSH_STABLE); } +#ifdef CONFIG_MIGRATION +int nfs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page) +{ + struct nfs_page *req; + int ret; + + if (PageFsCache(page)) + nfs_fscache_release_page(page, GFP_KERNEL); + + req = nfs_find_and_lock_request(page); + ret = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + ret = migrate_page(mapping, newpage, page); + if (!req) + goto out; + if (ret) + goto out_unlock; + page_cache_get(newpage); + req->wb_page = newpage; + SetPagePrivate(newpage); + set_page_private(newpage, page_private(page)); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); +out_unlock: + nfs_clear_page_tag_locked(req); + nfs_release_request(req); +out: + return ret; +} +#endif + int __init nfs_init_writepagecache(void) { nfs_wdata_cachep = kmem_cache_create("nfs_write_data", diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 5573508f707f..36fcabbf5186 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) int flags = nfsexp_flags(rqstp, exp); int ret; + validate_process_creds(); + /* discard any old override before preparing the new set */ revert_creds(get_cred(current->real_cred)); new = prepare_creds(); @@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); + validate_process_creds(); put_cred(override_creds(new)); put_cred(new); + validate_process_creds(); return 0; oom: diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index b92a27629fb7..d9462643155c 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -85,6 +85,11 @@ static void expkey_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } +static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, expkey_request); +} + static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); static struct cache_detail svc_expkey_cache; @@ -259,7 +264,7 @@ static struct cache_detail svc_expkey_cache = { .hash_table = expkey_table, .name = "nfsd.fh", .cache_put = expkey_put, - .cache_request = expkey_request, + .cache_upcall = expkey_upcall, .cache_parse = expkey_parse, .cache_show = expkey_show, .match = expkey_match, @@ -355,6 +360,11 @@ static void svc_export_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } +static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); +} + static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); @@ -724,7 +734,7 @@ struct cache_detail svc_export_cache = { .hash_table = export_table, .name = "nfsd.export", .cache_put = svc_export_put, - .cache_request = svc_export_request, + .cache_upcall = svc_export_upcall, .cache_parse = svc_export_parse, .cache_show = svc_export_show, .match = svc_export_match, diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 5b398421b051..cdfa86fa1471 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -146,6 +146,12 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, } static int +idtoname_upcall(struct cache_detail *cd, struct cache_head *ch) +{ + return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request); +} + +static int idtoname_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); @@ -175,10 +181,10 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) } static void -warn_no_idmapd(struct cache_detail *detail) +warn_no_idmapd(struct cache_detail *detail, int has_died) { printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", - detail->last_close? "died" : "not been started"); + has_died ? "died" : "not been started"); } @@ -192,7 +198,7 @@ static struct cache_detail idtoname_cache = { .hash_table = idtoname_table, .name = "nfs4.idtoname", .cache_put = ent_put, - .cache_request = idtoname_request, + .cache_upcall = idtoname_upcall, .cache_parse = idtoname_parse, .cache_show = idtoname_show, .warn_no_listener = warn_no_idmapd, @@ -325,6 +331,12 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, } static int +nametoid_upcall(struct cache_detail *cd, struct cache_head *ch) +{ + return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request); +} + +static int nametoid_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); @@ -363,7 +375,7 @@ static struct cache_detail nametoid_cache = { .hash_table = nametoid_table, .name = "nfs4.nametoid", .cache_put = ent_put, - .cache_request = nametoid_request, + .cache_upcall = nametoid_upcall, .cache_parse = nametoid_parse, .cache_show = nametoid_show, .warn_no_listener = warn_no_idmapd, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1250fb978ac1..7e906c5b7671 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -25,7 +25,6 @@ #include <linux/init.h> #include <linux/inet.h> #include <linux/string.h> -#include <linux/smp_lock.h> #include <linux/ctype.h> #include <linux/nfs.h> @@ -38,6 +37,7 @@ #include <linux/nfsd/xdr.h> #include <linux/nfsd/syscall.h> #include <linux/lockd/lockd.h> +#include <linux/sunrpc/clnt.h> #include <asm/uaccess.h> #include <net/ipv6.h> @@ -491,22 +491,18 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) * * Input: * buf: '\n'-terminated C string containing a - * presentation format IPv4 address + * presentation format IP address * size: length of C string in @buf * Output: * On success: returns zero if all specified locks were released; * returns one if one or more locks were not released * On error: return code is negative errno value - * - * Note: Only AF_INET client addresses are passed in */ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) { - struct sockaddr_in sin = { - .sin_family = AF_INET, - }; - int b1, b2, b3, b4; - char c; + struct sockaddr_storage address; + struct sockaddr *sap = (struct sockaddr *)&address; + size_t salen = sizeof(address); char *fo_path; /* sanity check */ @@ -520,14 +516,10 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; - /* get ipv4 address */ - if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4) - return -EINVAL; - if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255) + if (rpc_pton(fo_path, size, sap, salen) == 0) return -EINVAL; - sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); + return nlmsvc_unlock_all_by_ip(sap); } /** diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d4c9884cd54b..24d58adfe5fd 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -18,7 +18,6 @@ #include <linux/unistd.h> #include <linux/slab.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/freezer.h> #include <linux/fs_struct.h> #include <linux/kthread.h> @@ -497,7 +496,9 @@ nfsd(void *vrqstp) /* Lock the export hash tables for reading. */ exp_readlock(); + validate_process_creds(); svc_process(rqstp); + validate_process_creds(); /* Unlock export hash tables */ exp_readunlock(); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 23341c1063bc..8fa09bfbcba7 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -684,6 +684,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, __be32 err; int host_err; + validate_process_creds(); + /* * If we get here, then the client has already done an "open", * and (hopefully) checked permission - so allow OWNER_OVERRIDE @@ -740,6 +742,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, out_nfserr: err = nfserrno(host_err); out: + validate_process_creds(); return err; } diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig new file mode 100644 index 000000000000..251da07b2a1d --- /dev/null +++ b/fs/nilfs2/Kconfig @@ -0,0 +1,25 @@ +config NILFS2_FS + tristate "NILFS2 file system support (EXPERIMENTAL)" + depends on EXPERIMENTAL + select CRC32 + help + NILFS2 is a log-structured file system (LFS) supporting continuous + snapshotting. In addition to versioning capability of the entire + file system, users can even restore files mistakenly overwritten or + destroyed just a few seconds ago. Since this file system can keep + consistency like conventional LFS, it achieves quick recovery after + system crashes. + + NILFS2 creates a number of checkpoints every few seconds or per + synchronous write basis (unless there is no change). Users can + select significant versions among continuously created checkpoints, + and can change them into snapshots which will be preserved for long + periods until they are changed back to checkpoints. Each + snapshot is mountable as a read-only file system concurrently with + its writable mount, and this feature is convenient for online backup. + + Some features including atime, extended attributes, and POSIX ACLs, + are not supported yet. + + To compile this file system support as a module, choose M here: the + module will be called nilfs2. If unsure, say N. diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 36df60b6d8a4..08834df6ec68 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -36,6 +36,26 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); } +/** + * nilfs_bmap_lookup_at_level - find a data block or node block + * @bmap: bmap + * @key: key + * @level: level + * @ptrp: place to store the value associated to @key + * + * Description: nilfs_bmap_lookup_at_level() finds a record whose key + * matches @key in the block at @level of the bmap. + * + * Return Value: On success, 0 is returned and the record associated with @key + * is stored in the place pointed by @ptrp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, __u64 *ptrp) { @@ -69,39 +89,6 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, return ret; } -/** - * nilfs_bmap_lookup - find a record - * @bmap: bmap - * @key: key - * @recp: pointer to record - * - * Description: nilfs_bmap_lookup() finds a record whose key matches @key in - * @bmap. - * - * Return Value: On success, 0 is returned and the record associated with @key - * is stored in the place pointed by @recp. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - A record associated with @key does not exist. - */ -int nilfs_bmap_lookup(struct nilfs_bmap *bmap, - unsigned long key, - unsigned long *recp) -{ - __u64 ptr; - int ret; - - /* XXX: use macro for level 1 */ - ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr); - if (recp != NULL) - *recp = ptr; - return ret; -} - static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) { __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; @@ -469,105 +456,8 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) (entries_per_group / NILFS_BMAP_GROUP_DIV); } -int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req, - sector_t blocknr) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - int ret; - - ret = nilfs_dat_prepare_start(dat, &req->bpr_req); - if (likely(!ret)) - nilfs_dat_commit_start(dat, &req->bpr_req, blocknr); - return ret; -} - -int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, - bmap->b_ptr_type == NILFS_BMAP_PTR_VS); -} - -void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr, - sector_t blocknr) -{ - return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr); -} - -int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr) -{ - return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr); -} - -int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - int ret; - - ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req); - if (ret < 0) - return ret; - ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req); - if (ret < 0) - nilfs_dat_abort_end(dat, &oldreq->bpr_req); - - return ret; -} - -void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - - nilfs_dat_commit_end(dat, &oldreq->bpr_req, - bmap->b_ptr_type == NILFS_BMAP_PTR_VS); - nilfs_dat_commit_alloc(dat, &newreq->bpr_req); -} - -void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - - nilfs_dat_abort_end(dat, &oldreq->bpr_req); - nilfs_dat_abort_alloc(dat, &newreq->bpr_req); -} - static struct lock_class_key nilfs_bmap_dat_lock_key; +static struct lock_class_key nilfs_bmap_mdt_lock_key; /** * nilfs_bmap_read - read a bmap from an inode @@ -603,7 +493,11 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) bmap->b_ptr_type = NILFS_BMAP_PTR_VS; bmap->b_last_allocated_key = 0; bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key); break; + case NILFS_IFILE_INO: + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_mdt_lock_key); + /* Fall through */ default: bmap->b_ptr_type = NILFS_BMAP_PTR_VM; bmap->b_last_allocated_key = 0; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index b2890cdcef12..9980d7dbab91 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -28,6 +28,7 @@ #include <linux/buffer_head.h> #include <linux/nilfs2_fs.h> #include "alloc.h" +#include "dat.h" #define NILFS_BMAP_INVALID_PTR 0 @@ -141,7 +142,6 @@ struct nilfs_bmap { int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); -int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *); int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); @@ -160,90 +160,76 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); +static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key, + __u64 *ptr) +{ + return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr); +} + /* * Internal use only */ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *); -int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - return nilfs_bmap_prepare_alloc_v(bmap, req); + if (dat) + return nilfs_dat_prepare_alloc(dat, &req->bpr_req); /* ignore target ptr */ req->bpr_ptr = bmap->b_last_allocated_ptr++; return 0; } static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_commit_alloc_v(bmap, req); + if (dat) + nilfs_dat_commit_alloc(dat, &req->bpr_req); } static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_abort_alloc_v(bmap, req); + if (dat) + nilfs_dat_abort_alloc(dat, &req->bpr_req); else bmap->b_last_allocated_ptr--; } -int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); -void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); -void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); - static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - return NILFS_BMAP_USE_VBN(bmap) ? - nilfs_bmap_prepare_end_v(bmap, req) : 0; + return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0; } static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_commit_end_v(bmap, req); + if (dat) + nilfs_dat_commit_end(dat, &req->bpr_req, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); } static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_abort_end_v(bmap, req); + if (dat) + nilfs_dat_abort_end(dat, &req->bpr_req); } -int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *, - sector_t); -int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t); -int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64); - - __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, const struct buffer_head *); __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); -int nilfs_bmap_prepare_update_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_commit_update_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_abort_update_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); - void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int); void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 7e0b61be212e..c668bca579c1 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -209,6 +209,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, * We cannot call radix_tree_preload for the kernels older * than 2.6.23, because it is not exported for modules. */ +retry: err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (err) goto failed_unlock; @@ -219,7 +220,6 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, (unsigned long long)oldkey, (unsigned long long)newkey); -retry: spin_lock_irq(&btnc->tree_lock); err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); spin_unlock_irq(&btnc->tree_lock); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index aa412724b64e..e25b507a474f 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -71,21 +71,17 @@ void nilfs_btree_path_cache_destroy(void) kmem_cache_destroy(nilfs_btree_path_cache); } -static inline struct nilfs_btree_path * -nilfs_btree_alloc_path(const struct nilfs_btree *btree) +static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void) { - return (struct nilfs_btree_path *) - kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); + return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); } -static inline void nilfs_btree_free_path(const struct nilfs_btree *btree, - struct nilfs_btree_path *path) +static inline void nilfs_btree_free_path(struct nilfs_btree_path *path) { kmem_cache_free(nilfs_btree_path_cache, path); } -static void nilfs_btree_init_path(const struct nilfs_btree *btree, - struct nilfs_btree_path *path) +static void nilfs_btree_init_path(struct nilfs_btree_path *path) { int level; @@ -101,26 +97,13 @@ static void nilfs_btree_init_path(const struct nilfs_btree *btree, } } -static void nilfs_btree_clear_path(const struct nilfs_btree *btree, - struct nilfs_btree_path *path) +static void nilfs_btree_release_path(struct nilfs_btree_path *path) { int level; - for (level = NILFS_BTREE_LEVEL_DATA; - level < NILFS_BTREE_LEVEL_MAX; - level++) { - if (path[level].bp_bh != NULL) { - brelse(path[level].bp_bh); - path[level].bp_bh = NULL; - } - /* sib_bh is released or deleted by prepare or commit - * operations. */ - path[level].bp_sib_bh = NULL; - path[level].bp_index = 0; - path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; - path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; - path[level].bp_op = NULL; - } + for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX; + level++) + brelse(path[level].bp_bh); } /* @@ -148,129 +131,110 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, } static inline int -nilfs_btree_node_get_flags(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) { return node->bn_flags; } static inline void -nilfs_btree_node_set_flags(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int flags) +nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) { node->bn_flags = flags; } -static inline int nilfs_btree_node_root(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node) { - return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT; + return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; } static inline int -nilfs_btree_node_get_level(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_get_level(const struct nilfs_btree_node *node) { return node->bn_level; } static inline void -nilfs_btree_node_set_level(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int level) +nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) { node->bn_level = level; } static inline int -nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) { return le16_to_cpu(node->bn_nchildren); } static inline void -nilfs_btree_node_set_nchildren(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int nchildren) +nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) { node->bn_nchildren = cpu_to_le16(nchildren); } -static inline int -nilfs_btree_node_size(const struct nilfs_btree *btree) +static inline int nilfs_btree_node_size(const struct nilfs_btree *btree) { return 1 << btree->bt_bmap.b_inode->i_blkbits; } static inline int -nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node, + const struct nilfs_btree *btree) { - return nilfs_btree_node_root(btree, node) ? + return nilfs_btree_node_root(node) ? NILFS_BTREE_ROOT_NCHILDREN_MIN : NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); } static inline int -nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node, + const struct nilfs_btree *btree) { - return nilfs_btree_node_root(btree, node) ? + return nilfs_btree_node_root(node) ? NILFS_BTREE_ROOT_NCHILDREN_MAX : NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); } static inline __le64 * -nilfs_btree_node_dkeys(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) { return (__le64 *)((char *)(node + 1) + - (nilfs_btree_node_root(btree, node) ? + (nilfs_btree_node_root(node) ? 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); } static inline __le64 * -nilfs_btree_node_dptrs(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, + const struct nilfs_btree *btree) { - return (__le64 *)(nilfs_btree_node_dkeys(btree, node) + - nilfs_btree_node_nchildren_max(btree, node)); + return (__le64 *)(nilfs_btree_node_dkeys(node) + + nilfs_btree_node_nchildren_max(node, btree)); } static inline __u64 -nilfs_btree_node_get_key(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, int index) +nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) { - return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) + - index)); + return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index)); } static inline void -nilfs_btree_node_set_key(struct nilfs_btree *btree, - struct nilfs_btree_node *node, int index, __u64 key) +nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) { - *(nilfs_btree_node_dkeys(btree, node) + index) = - nilfs_bmap_key_to_dkey(key); + *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key); } static inline __u64 nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, - int index) + const struct nilfs_btree_node *node, int index) { - return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) + + return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) + index)); } static inline void nilfs_btree_node_set_ptr(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int index, - __u64 ptr) + struct nilfs_btree_node *node, int index, __u64 ptr) { - *(nilfs_btree_node_dptrs(btree, node) + index) = + *(nilfs_btree_node_dptrs(node, btree) + index) = nilfs_bmap_ptr_to_dptr(ptr); } @@ -283,12 +247,12 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree, __le64 *dptrs; int i; - nilfs_btree_node_set_flags(btree, node, flags); - nilfs_btree_node_set_level(btree, node, level); - nilfs_btree_node_set_nchildren(btree, node, nchildren); + nilfs_btree_node_set_flags(node, flags); + nilfs_btree_node_set_level(node, level); + nilfs_btree_node_set_nchildren(node, nchildren); - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); for (i = 0; i < nchildren; i++) { dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); @@ -305,13 +269,13 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree, __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; - ldkeys = nilfs_btree_node_dkeys(btree, left); - ldptrs = nilfs_btree_node_dptrs(btree, left); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, btree); + lnchildren = nilfs_btree_node_get_nchildren(left); - rdkeys = nilfs_btree_node_dkeys(btree, right); - rdptrs = nilfs_btree_node_dptrs(btree, right); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, btree); + rnchildren = nilfs_btree_node_get_nchildren(right); memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); @@ -320,8 +284,8 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree, lnchildren += n; rnchildren -= n; - nilfs_btree_node_set_nchildren(btree, left, lnchildren); - nilfs_btree_node_set_nchildren(btree, right, rnchildren); + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); } /* Assume that the buffer heads corresponding to left and right are locked. */ @@ -334,13 +298,13 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree, __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; - ldkeys = nilfs_btree_node_dkeys(btree, left); - ldptrs = nilfs_btree_node_dptrs(btree, left); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, btree); + lnchildren = nilfs_btree_node_get_nchildren(left); - rdkeys = nilfs_btree_node_dkeys(btree, right); - rdptrs = nilfs_btree_node_dptrs(btree, right); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, btree); + rnchildren = nilfs_btree_node_get_nchildren(right); memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); @@ -349,8 +313,8 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree, lnchildren -= n; rnchildren += n; - nilfs_btree_node_set_nchildren(btree, left, lnchildren); - nilfs_btree_node_set_nchildren(btree, right, rnchildren); + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); } /* Assume that the buffer head corresponding to node is locked. */ @@ -362,9 +326,9 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree, __le64 *dptrs; int nchildren; - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); - nchildren = nilfs_btree_node_get_nchildren(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); + nchildren = nilfs_btree_node_get_nchildren(node); if (index < nchildren) { memmove(dkeys + index + 1, dkeys + index, (nchildren - index) * sizeof(*dkeys)); @@ -374,7 +338,7 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree, dkeys[index] = nilfs_bmap_key_to_dkey(key); dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); nchildren++; - nilfs_btree_node_set_nchildren(btree, node, nchildren); + nilfs_btree_node_set_nchildren(node, nchildren); } /* Assume that the buffer head corresponding to node is locked. */ @@ -388,11 +352,11 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree, __le64 *dptrs; int nchildren; - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); key = nilfs_bmap_dkey_to_key(dkeys[index]); ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); - nchildren = nilfs_btree_node_get_nchildren(btree, node); + nchildren = nilfs_btree_node_get_nchildren(node); if (keyp != NULL) *keyp = key; if (ptrp != NULL) @@ -405,11 +369,10 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree, (nchildren - index - 1) * sizeof(*dptrs)); } nchildren--; - nilfs_btree_node_set_nchildren(btree, node, nchildren); + nilfs_btree_node_set_nchildren(node, nchildren); } -static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, +static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, __u64 key, int *indexp) { __u64 nkey; @@ -417,12 +380,12 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, /* binary search */ low = 0; - high = nilfs_btree_node_get_nchildren(btree, node) - 1; + high = nilfs_btree_node_get_nchildren(node) - 1; index = 0; s = 0; while (low <= high) { index = (low + high) / 2; - nkey = nilfs_btree_node_get_key(btree, node, index); + nkey = nilfs_btree_node_get_key(node, index); if (nkey == key) { s = 0; goto out; @@ -436,9 +399,8 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, } /* adjust index */ - if (nilfs_btree_node_get_level(btree, node) > - NILFS_BTREE_LEVEL_NODE_MIN) { - if ((s > 0) && (index > 0)) + if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) { + if (s > 0 && index > 0) index--; } else if (s < 0) index++; @@ -456,25 +418,20 @@ nilfs_btree_get_root(const struct nilfs_btree *btree) } static inline struct nilfs_btree_node * -nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree, - const struct nilfs_btree_path *path, - int level) +nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_bh->b_data; } static inline struct nilfs_btree_node * -nilfs_btree_get_sib_node(const struct nilfs_btree *btree, - const struct nilfs_btree_path *path, - int level) +nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; } static inline int nilfs_btree_height(const struct nilfs_btree *btree) { - return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree)) - + 1; + return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; } static inline struct nilfs_btree_node * @@ -484,7 +441,7 @@ nilfs_btree_get_node(const struct nilfs_btree *btree, { return (level == nilfs_btree_height(btree) - 1) ? nilfs_btree_get_root(btree) : - nilfs_btree_get_nonroot_node(btree, path, level); + nilfs_btree_get_nonroot_node(path, level); } static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, @@ -496,12 +453,11 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, int level, index, found, ret; node = nilfs_btree_get_root(btree); - level = nilfs_btree_node_get_level(btree, node); - if ((level < minlevel) || - (nilfs_btree_node_get_nchildren(btree, node) <= 0)) + level = nilfs_btree_node_get_level(node); + if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0) return -ENOENT; - found = nilfs_btree_node_lookup(btree, node, key, &index); + found = nilfs_btree_node_lookup(node, key, &index); ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_bh = NULL; path[level].bp_index = index; @@ -510,14 +466,13 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); if (ret < 0) return ret; - node = nilfs_btree_get_nonroot_node(btree, path, level); - BUG_ON(level != nilfs_btree_node_get_level(btree, node)); + node = nilfs_btree_get_nonroot_node(path, level); + BUG_ON(level != nilfs_btree_node_get_level(node)); if (!found) - found = nilfs_btree_node_lookup(btree, node, key, - &index); + found = nilfs_btree_node_lookup(node, key, &index); else index = 0; - if (index < nilfs_btree_node_nchildren_max(btree, node)) + if (index < nilfs_btree_node_nchildren_max(node, btree)) ptr = nilfs_btree_node_get_ptr(btree, node, index); else { WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); @@ -544,10 +499,10 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, int index, level, ret; node = nilfs_btree_get_root(btree); - index = nilfs_btree_node_get_nchildren(btree, node) - 1; + index = nilfs_btree_node_get_nchildren(node) - 1; if (index < 0) return -ENOENT; - level = nilfs_btree_node_get_level(btree, node); + level = nilfs_btree_node_get_level(node); ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_bh = NULL; path[level].bp_index = index; @@ -556,15 +511,15 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); if (ret < 0) return ret; - node = nilfs_btree_get_nonroot_node(btree, path, level); - BUG_ON(level != nilfs_btree_node_get_level(btree, node)); - index = nilfs_btree_node_get_nchildren(btree, node) - 1; + node = nilfs_btree_get_nonroot_node(path, level); + BUG_ON(level != nilfs_btree_node_get_level(node)); + index = nilfs_btree_node_get_nchildren(node) - 1; ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_index = index; } if (keyp != NULL) - *keyp = nilfs_btree_node_get_key(btree, node, index); + *keyp = nilfs_btree_node_get_key(node, index); if (ptrp != NULL) *ptrp = ptr; @@ -580,18 +535,18 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, int ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); if (ptrp != NULL) *ptrp = ptr; - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -608,10 +563,10 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, int level = NILFS_BTREE_LEVEL_NODE_MIN; int ret, cnt, index, maxlevel; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); if (ret < 0) goto out; @@ -631,8 +586,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, node = nilfs_btree_get_node(btree, path, level); index = path[level].bp_index + 1; for (;;) { - while (index < nilfs_btree_node_get_nchildren(btree, node)) { - if (nilfs_btree_node_get_key(btree, node, index) != + while (index < nilfs_btree_node_get_nchildren(node)) { + if (nilfs_btree_node_get_key(node, index) != key + cnt) goto end; ptr2 = nilfs_btree_node_get_ptr(btree, node, index); @@ -653,8 +608,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, /* look-up right sibling node */ node = nilfs_btree_get_node(btree, path, level + 1); index = path[level + 1].bp_index + 1; - if (index >= nilfs_btree_node_get_nchildren(btree, node) || - nilfs_btree_node_get_key(btree, node, index) != key + cnt) + if (index >= nilfs_btree_node_get_nchildren(node) || + nilfs_btree_node_get_key(node, index) != key + cnt) break; ptr2 = nilfs_btree_node_get_ptr(btree, node, index); path[level + 1].bp_index = index; @@ -664,7 +619,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); if (ret < 0) goto out; - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); index = 0; path[level].bp_index = index; } @@ -672,8 +627,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, *ptrp = ptr; ret = cnt; out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -685,9 +640,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, do { lock_buffer(path[level].bp_bh); nilfs_btree_node_set_key( - btree, - nilfs_btree_get_nonroot_node( - btree, path, level), + nilfs_btree_get_nonroot_node(path, level), path[level].bp_index, key); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -698,8 +651,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, /* root */ if (level == nilfs_btree_height(btree) - 1) { - nilfs_btree_node_set_key(btree, - nilfs_btree_get_root(btree), + nilfs_btree_node_set_key(nilfs_btree_get_root(btree), path[level].bp_index, key); } } @@ -712,7 +664,7 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, if (level < nilfs_btree_height(btree) - 1) { lock_buffer(path[level].bp_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_insert(btree, node, *keyp, *ptrp, path[level].bp_index); if (!buffer_dirty(path[level].bp_bh)) @@ -721,8 +673,8 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key( - btree, node, 0)); + nilfs_btree_node_get_key(node, + 0)); } else { node = nilfs_btree_get_root(btree); nilfs_btree_node_insert(btree, node, *keyp, *ptrp, @@ -740,10 +692,10 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - left = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); move = 0; n = (nchildren + lnchildren + 1) / 2 - lnchildren; @@ -764,7 +716,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, unlock_buffer(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, node, 0)); + nilfs_btree_node_get_key(node, 0)); if (move) { brelse(path[level].bp_bh); @@ -791,10 +743,10 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); move = 0; n = (nchildren + rnchildren + 1) / 2 - rnchildren; @@ -816,15 +768,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, right, 0)); + nilfs_btree_node_get_key(right, 0)); path[level + 1].bp_index--; if (move) { brelse(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; - path[level].bp_index -= - nilfs_btree_node_get_nchildren(btree, node); + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); path[level + 1].bp_index++; } else { brelse(path[level].bp_sib_bh); @@ -846,9 +797,9 @@ static void nilfs_btree_split(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); move = 0; n = (nchildren + 1) / 2; @@ -867,16 +818,15 @@ static void nilfs_btree_split(struct nilfs_btree *btree, unlock_buffer(path[level].bp_bh); unlock_buffer(path[level].bp_sib_bh); - newkey = nilfs_btree_node_get_key(btree, right, 0); + newkey = nilfs_btree_node_get_key(right, 0); newptr = path[level].bp_newreq.bpr_ptr; if (move) { - path[level].bp_index -= - nilfs_btree_node_get_nchildren(btree, node); + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); nilfs_btree_node_insert(btree, right, *keyp, *ptrp, path[level].bp_index); - *keyp = nilfs_btree_node_get_key(btree, right, 0); + *keyp = nilfs_btree_node_get_key(right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; brelse(path[level].bp_bh); @@ -885,7 +835,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree, } else { nilfs_btree_do_insert(btree, path, level, keyp, ptrp); - *keyp = nilfs_btree_node_get_key(btree, right, 0); + *keyp = nilfs_btree_node_get_key(right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; brelse(path[level].bp_sib_bh); @@ -905,12 +855,12 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, lock_buffer(path[level].bp_sib_bh); root = nilfs_btree_get_root(btree); - child = nilfs_btree_get_sib_node(btree, path, level); + child = nilfs_btree_get_sib_node(path, level); - n = nilfs_btree_node_get_nchildren(btree, root); + n = nilfs_btree_node_get_nchildren(root); nilfs_btree_node_move_right(btree, root, child, n); - nilfs_btree_node_set_level(btree, root, level + 1); + nilfs_btree_node_set_level(root, level + 1); if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); @@ -922,7 +872,7 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, nilfs_btree_do_insert(btree, path, level, keyp, ptrp); - *keyp = nilfs_btree_node_get_key(btree, child, 0); + *keyp = nilfs_btree_node_get_key(child, 0); *ptrp = path[level].bp_newreq.bpr_ptr; } @@ -990,26 +940,29 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, struct nilfs_btree_node *node, *parent, *sib; __u64 sibptr; int pindex, level, ret; + struct inode *dat = NULL; stats->bs_nblocks = 0; level = NILFS_BTREE_LEVEL_DATA; /* allocate a new ptr for data block */ - if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) + if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { path[level].bp_newreq.bpr_ptr = nilfs_btree_find_target_v(btree, path, key); + dat = nilfs_bmap_get_dat(&btree->bt_bmap); + } ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); if (ret < 0) goto err_out_data; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < nilfs_btree_height(btree) - 1; level++) { - node = nilfs_btree_get_nonroot_node(btree, path, level); - if (nilfs_btree_node_get_nchildren(btree, node) < - nilfs_btree_node_nchildren_max(btree, node)) { + node = nilfs_btree_get_nonroot_node(path, level); + if (nilfs_btree_node_get_nchildren(node) < + nilfs_btree_node_nchildren_max(node, btree)) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; @@ -1026,8 +979,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) < - nilfs_btree_node_nchildren_max(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) < + nilfs_btree_node_nchildren_max(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_left; stats->bs_nblocks++; @@ -1038,15 +991,15 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* right sibling */ if (pindex < - nilfs_btree_node_get_nchildren(btree, parent) - 1) { + nilfs_btree_node_get_nchildren(parent) - 1) { sibptr = nilfs_btree_node_get_ptr(btree, parent, pindex + 1); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) < - nilfs_btree_node_nchildren_max(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) < + nilfs_btree_node_nchildren_max(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_right; stats->bs_nblocks++; @@ -1059,7 +1012,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; ret = nilfs_btree_get_new_block(btree, @@ -1081,8 +1034,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* root */ node = nilfs_btree_get_root(btree); - if (nilfs_btree_node_get_nchildren(btree, node) < - nilfs_btree_node_nchildren_max(btree, node)) { + if (nilfs_btree_node_get_nchildren(node) < + nilfs_btree_node_nchildren_max(node, btree)) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; @@ -1091,7 +1044,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* grow */ path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, @@ -1119,16 +1072,18 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* error */ err_out_curr_node: - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); + nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, + dat); err_out_child_node: for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { nilfs_btnode_delete(path[level].bp_sib_bh); nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); } - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); + nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, + dat); err_out_data: *levelp = level; stats->bs_nblocks = 0; @@ -1139,16 +1094,19 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree, struct nilfs_btree_path *path, int maxlevel, __u64 key, __u64 ptr) { + struct inode *dat = NULL; int level; set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; - if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) + if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { nilfs_btree_set_target_v(btree, key, ptr); + dat = nilfs_bmap_get_dat(&btree->bt_bmap); + } for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, - &path[level - 1].bp_newreq); + &path[level - 1].bp_newreq, dat); path[level].bp_op(btree, path, level, &key, &ptr); } @@ -1164,10 +1122,10 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) int level, ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN); @@ -1184,8 +1142,8 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -1197,7 +1155,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, if (level < nilfs_btree_height(btree) - 1) { lock_buffer(path[level].bp_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_delete(btree, node, keyp, ptrp, path[level].bp_index); if (!buffer_dirty(path[level].bp_bh)) @@ -1205,7 +1163,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, unlock_buffer(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, node, 0)); + nilfs_btree_node_get_key(node, 0)); } else { node = nilfs_btree_get_root(btree); nilfs_btree_node_delete(btree, node, keyp, ptrp, @@ -1225,10 +1183,10 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - left = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); n = (nchildren + lnchildren) / 2 - nchildren; @@ -1243,7 +1201,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, unlock_buffer(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, node, 0)); + nilfs_btree_node_get_key(node, 0)); brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; @@ -1262,10 +1220,10 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); n = (nchildren + rnchildren) / 2 - nchildren; @@ -1281,7 +1239,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, right, 0)); + nilfs_btree_node_get_key(right, 0)); path[level + 1].bp_index--; brelse(path[level].bp_sib_bh); @@ -1300,10 +1258,10 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - left = nilfs_btree_get_sib_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); - n = nilfs_btree_node_get_nchildren(btree, node); + n = nilfs_btree_node_get_nchildren(node); nilfs_btree_node_move_left(btree, left, node, n); @@ -1316,7 +1274,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; - path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left); + path[level].bp_index += nilfs_btree_node_get_nchildren(left); } static void nilfs_btree_concat_right(struct nilfs_btree *btree, @@ -1331,10 +1289,10 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); - n = nilfs_btree_node_get_nchildren(btree, right); + n = nilfs_btree_node_get_nchildren(right); nilfs_btree_node_move_left(btree, node, right, n); @@ -1360,11 +1318,11 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); root = nilfs_btree_get_root(btree); - child = nilfs_btree_get_nonroot_node(btree, path, level); + child = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_delete(btree, root, NULL, NULL, 0); - nilfs_btree_node_set_level(btree, root, level); - n = nilfs_btree_node_get_nchildren(btree, child); + nilfs_btree_node_set_level(root, level); + n = nilfs_btree_node_get_nchildren(child); nilfs_btree_node_move_left(btree, root, child, n); unlock_buffer(path[level].bp_bh); @@ -1376,7 +1334,8 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, struct nilfs_btree_path *path, int *levelp, - struct nilfs_bmap_stats *stats) + struct nilfs_bmap_stats *stats, + struct inode *dat) { struct buffer_head *bh; struct nilfs_btree_node *node, *parent, *sib; @@ -1388,17 +1347,17 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < nilfs_btree_height(btree) - 1; level++) { - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); path[level].bp_oldreq.bpr_ptr = nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; - if (nilfs_btree_node_get_nchildren(btree, node) > - nilfs_btree_node_nchildren_min(btree, node)) { + if (nilfs_btree_node_get_nchildren(node) > + nilfs_btree_node_nchildren_min(node, btree)) { path[level].bp_op = nilfs_btree_do_delete; stats->bs_nblocks++; goto out; @@ -1415,8 +1374,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) > - nilfs_btree_node_nchildren_min(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) > + nilfs_btree_node_nchildren_min(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_left; stats->bs_nblocks++; @@ -1428,7 +1387,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* continue; */ } } else if (pindex < - nilfs_btree_node_get_nchildren(btree, parent) - 1) { + nilfs_btree_node_get_nchildren(parent) - 1) { /* right sibling */ sibptr = nilfs_btree_node_get_ptr(btree, parent, pindex + 1); @@ -1436,8 +1395,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) > - nilfs_btree_node_nchildren_min(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) > + nilfs_btree_node_nchildren_min(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_right; stats->bs_nblocks++; @@ -1452,7 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* no siblings */ /* the only child of the root node */ WARN_ON(level != nilfs_btree_height(btree) - 2); - if (nilfs_btree_node_get_nchildren(btree, node) - 1 <= + if (nilfs_btree_node_get_nchildren(node) - 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) { path[level].bp_op = nilfs_btree_shrink; stats->bs_nblocks += 2; @@ -1471,7 +1430,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; @@ -1486,12 +1445,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* error */ err_out_curr_node: - nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq); + nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat); err_out_child_node: for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { brelse(path[level].bp_sib_bh); nilfs_bmap_abort_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); } *levelp = level; stats->bs_nblocks = 0; @@ -1500,13 +1459,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, static void nilfs_btree_commit_delete(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int maxlevel) + int maxlevel, struct inode *dat) { int level; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { nilfs_bmap_commit_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); path[level].bp_op(btree, path, level, NULL, NULL); } @@ -1520,27 +1479,32 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) struct nilfs_btree *btree; struct nilfs_btree_path *path; struct nilfs_bmap_stats stats; + struct inode *dat; int level, ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN); if (ret < 0) goto out; - ret = nilfs_btree_prepare_delete(btree, path, &level, &stats); + + dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ? + nilfs_bmap_get_dat(&btree->bt_bmap) : NULL; + + ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); if (ret < 0) goto out; - nilfs_btree_commit_delete(btree, path, level); + nilfs_btree_commit_delete(btree, path, level, dat); nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -1551,15 +1515,15 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) int ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -1581,7 +1545,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) node = root; break; case 3: - nchildren = nilfs_btree_node_get_nchildren(btree, root); + nchildren = nilfs_btree_node_get_nchildren(root); if (nchildren > 1) return 0; ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); @@ -1594,10 +1558,10 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) return 0; } - nchildren = nilfs_btree_node_get_nchildren(btree, node); - maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1); + nchildren = nilfs_btree_node_get_nchildren(node); + maxkey = nilfs_btree_node_get_key(node, nchildren - 1); nextmaxkey = (nchildren > 1) ? - nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0; + nilfs_btree_node_get_key(node, nchildren - 2) : 0; if (bh != NULL) brelse(bh); @@ -1623,7 +1587,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, node = root; break; case 3: - nchildren = nilfs_btree_node_get_nchildren(btree, root); + nchildren = nilfs_btree_node_get_nchildren(root); WARN_ON(nchildren > 1); ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); ret = nilfs_btree_get_block(btree, ptr, &bh); @@ -1636,11 +1600,11 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, return -EINVAL; } - nchildren = nilfs_btree_node_get_nchildren(btree, node); + nchildren = nilfs_btree_node_get_nchildren(node); if (nchildren < nitems) nitems = nchildren; - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); for (i = 0; i < nitems; i++) { keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); @@ -1660,18 +1624,20 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, struct nilfs_bmap_stats *stats) { struct buffer_head *bh; - struct nilfs_btree *btree; + struct nilfs_btree *btree = (struct nilfs_btree *)bmap; + struct inode *dat = NULL; int ret; - btree = (struct nilfs_btree *)bmap; stats->bs_nblocks = 0; /* for data */ /* cannot find near ptr */ - if (NILFS_BMAP_USE_VBN(bmap)) + if (NILFS_BMAP_USE_VBN(bmap)) { dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); + dat = nilfs_bmap_get_dat(bmap); + } - ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq); + ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat); if (ret < 0) return ret; @@ -1679,7 +1645,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, stats->bs_nblocks++; if (nreq != NULL) { nreq->bpr_ptr = dreq->bpr_ptr + 1; - ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq); + ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat); if (ret < 0) goto err_out_dreq; @@ -1696,9 +1662,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, /* error */ err_out_nreq: - nilfs_bmap_abort_alloc_ptr(bmap, nreq); + nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat); err_out_dreq: - nilfs_bmap_abort_alloc_ptr(bmap, dreq); + nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat); stats->bs_nblocks = 0; return ret; @@ -1713,8 +1679,9 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *nreq, struct buffer_head *bh) { - struct nilfs_btree *btree; + struct nilfs_btree *btree = (struct nilfs_btree *)bmap; struct nilfs_btree_node *node; + struct inode *dat; __u64 tmpptr; /* free resources */ @@ -1725,11 +1692,11 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); /* convert and insert */ - btree = (struct nilfs_btree *)bmap; + dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; nilfs_btree_init(bmap); if (nreq != NULL) { - nilfs_bmap_commit_alloc_ptr(bmap, dreq); - nilfs_bmap_commit_alloc_ptr(bmap, nreq); + nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); + nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); /* create child node at level 1 */ lock_buffer(bh); @@ -1751,7 +1718,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 2, 1, &keys[0], &tmpptr); } else { - nilfs_bmap_commit_alloc_ptr(bmap, dreq); + nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); /* create root node at level 1 */ node = nilfs_btree_get_root(btree); @@ -1822,7 +1789,7 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree, static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level) + int level, struct inode *dat) { struct nilfs_btree_node *parent; int ret; @@ -1832,9 +1799,8 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; - ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); if (ret < 0) return ret; @@ -1846,9 +1812,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, &path[level].bp_ctxt); if (ret < 0) { - nilfs_bmap_abort_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_dat_abort_update(dat, + &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); return ret; } } @@ -1858,13 +1824,13 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level) + int level, struct inode *dat) { struct nilfs_btree_node *parent; - nilfs_bmap_commit_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req, + btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS); if (buffer_nilfs_node(path[level].bp_bh)) { nilfs_btnode_commit_change_key( @@ -1881,11 +1847,10 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level) + int level, struct inode *dat) { - nilfs_bmap_abort_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); if (buffer_nilfs_node(path[level].bp_bh)) nilfs_btnode_abort_change_key( &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, @@ -1894,14 +1859,14 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int minlevel, - int *maxlevelp) + int minlevel, int *maxlevelp, + struct inode *dat) { int level, ret; level = minlevel; if (!buffer_nilfs_volatile(path[level].bp_bh)) { - ret = nilfs_btree_prepare_update_v(btree, path, level); + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); if (ret < 0) return ret; } @@ -1909,7 +1874,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, !buffer_dirty(path[level].bp_bh)) { WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); - ret = nilfs_btree_prepare_update_v(btree, path, level); + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); if (ret < 0) goto out; } @@ -1921,39 +1886,40 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, /* error */ out: while (--level > minlevel) - nilfs_btree_abort_update_v(btree, path, level); + nilfs_btree_abort_update_v(btree, path, level, dat); if (!buffer_nilfs_volatile(path[level].bp_bh)) - nilfs_btree_abort_update_v(btree, path, level); + nilfs_btree_abort_update_v(btree, path, level, dat); return ret; } static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int minlevel, - int maxlevel, - struct buffer_head *bh) + int minlevel, int maxlevel, + struct buffer_head *bh, + struct inode *dat) { int level; if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) - nilfs_btree_commit_update_v(btree, path, minlevel); + nilfs_btree_commit_update_v(btree, path, minlevel, dat); for (level = minlevel + 1; level <= maxlevel; level++) - nilfs_btree_commit_update_v(btree, path, level); + nilfs_btree_commit_update_v(btree, path, level, dat); } static int nilfs_btree_propagate_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level, - struct buffer_head *bh) + int level, struct buffer_head *bh) { int maxlevel, ret; struct nilfs_btree_node *parent; + struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); __u64 ptr; get_bh(bh); path[level].bp_bh = bh; - ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel); + ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel, + dat); if (ret < 0) goto out; @@ -1961,12 +1927,12 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree, parent = nilfs_btree_get_node(btree, path, level + 1); ptr = nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); - ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr); + ret = nilfs_dat_mark_dirty(dat, ptr); if (ret < 0) goto out; } - nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh); + nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat); out: brelse(path[level].bp_bh); @@ -1986,15 +1952,15 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, WARN_ON(!buffer_dirty(bh)); btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); if (buffer_nilfs_node(bh)) { node = (struct nilfs_btree_node *)bh->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); - level = nilfs_btree_node_get_level(btree, node); + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); } else { key = nilfs_bmap_data_get_key(bmap, bh); level = NILFS_BTREE_LEVEL_DATA; @@ -2013,8 +1979,8 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, nilfs_btree_propagate_p(btree, path, level, bh); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -2022,7 +1988,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, struct buffer_head *bh) { - return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr); + return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr); } static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, @@ -2037,12 +2003,12 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, get_bh(bh); node = (struct nilfs_btree_node *)bh->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); - level = nilfs_btree_node_get_level(btree, node); + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); list_for_each(head, &lists[level]) { cbh = list_entry(head, struct buffer_head, b_assoc_buffers); cnode = (struct nilfs_btree_node *)cbh->b_data; - ckey = nilfs_btree_node_get_key(btree, cnode, 0); + ckey = nilfs_btree_node_get_key(cnode, 0); if (key < ckey) break; } @@ -2120,8 +2086,7 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree, nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, blocknr); - key = nilfs_btree_node_get_key(btree, parent, - path[level + 1].bp_index); + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); binfo->bi_dat.bi_level = level; @@ -2137,6 +2102,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, union nilfs_binfo *binfo) { struct nilfs_btree_node *parent; + struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); __u64 key; __u64 ptr; union nilfs_bmap_ptr_req req; @@ -2146,12 +2112,12 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, ptr = nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); req.bpr_ptr = ptr; - ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr); - if (unlikely(ret < 0)) + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (ret < 0) return ret; + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); - key = nilfs_btree_node_get_key(btree, parent, - path[level + 1].bp_index); + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); @@ -2171,15 +2137,15 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, int level, ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); - level = nilfs_btree_node_get_level(btree, node); + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); } else { key = nilfs_bmap_data_get_key(bmap, *bh); level = NILFS_BTREE_LEVEL_DATA; @@ -2196,8 +2162,8 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -2207,19 +2173,18 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, sector_t blocknr, union nilfs_binfo *binfo) { - struct nilfs_btree *btree; struct nilfs_btree_node *node; __u64 key; int ret; - btree = (struct nilfs_btree *)bmap; - ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr); + ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr, + blocknr); if (ret < 0) return ret; if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); + key = nilfs_btree_node_get_key(node, 0); } else key = nilfs_bmap_data_get_key(bmap, *bh); @@ -2239,10 +2204,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) int ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); if (ret < 0) { @@ -2262,8 +2227,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) nilfs_bmap_set_dirty(&btree->bt_bmap); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 7d49813f66d6..1c6cfb59128d 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -307,7 +307,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) { if (ret != -ENOENT) - goto out_header; + break; /* skip hole */ ret = 0; continue; @@ -340,7 +340,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, continue; printk(KERN_ERR "%s: cannot delete block\n", __func__); - goto out_header; + break; } } @@ -358,7 +358,6 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, kunmap_atomic(kaddr, KM_USER0); } - out_header: brelse(header_bh); out_sem: @@ -816,8 +815,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) void *kaddr; int ret; - if (cno == 0) - return -ENOENT; /* checkpoint number 0 is invalid */ + /* CP number is invalid if it's zero or larger than the + largest exist one.*/ + if (cno == 0 || cno >= nilfs_mdt_cno(cpfile)) + return -ENOENT; down_read(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); @@ -825,7 +826,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) goto out; kaddr = kmap_atomic(bh->b_page, KM_USER0); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); - ret = nilfs_checkpoint_snapshot(cp); + if (nilfs_checkpoint_invalid(cp)) + ret = -ENOENT; + else + ret = nilfs_checkpoint_snapshot(cp); kunmap_atomic(kaddr, KM_USER0); brelse(bh); diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 788a45950197..debea896e701 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -27,8 +27,6 @@ #include <linux/buffer_head.h> #include <linux/nilfs2_fs.h> -#define NILFS_CPFILE_GFP NILFS_MDT_GFP - int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, struct nilfs_checkpoint **, diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 0b2710e2d565..1ff8e15bd36b 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -109,12 +109,6 @@ void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) nilfs_palloc_commit_free_entry(dat, req); } -void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req) -{ - nilfs_dat_abort_entry(dat, req); - nilfs_palloc_abort_free_entry(dat, req); -} - int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) { int ret; @@ -134,26 +128,12 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); - if (entry->de_blocknr != cpu_to_le64(0) || - entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) { - printk(KERN_CRIT - "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n", - __func__, (unsigned long long)req->pr_entry_nr, - (unsigned long long)le64_to_cpu(entry->de_start), - (unsigned long long)le64_to_cpu(entry->de_end), - (unsigned long long)le64_to_cpu(entry->de_blocknr)); - } entry->de_blocknr = cpu_to_le64(blocknr); kunmap_atomic(kaddr, KM_USER0); nilfs_dat_commit_entry(dat, req); } -void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req) -{ - nilfs_dat_abort_entry(dat, req); -} - int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; @@ -231,6 +211,37 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) nilfs_dat_abort_entry(dat, req); } +int nilfs_dat_prepare_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + int ret; + + ret = nilfs_dat_prepare_end(dat, oldreq); + if (!ret) { + ret = nilfs_dat_prepare_alloc(dat, newreq); + if (ret < 0) + nilfs_dat_abort_end(dat, oldreq); + } + return ret; +} + +void nilfs_dat_commit_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq, int dead) +{ + nilfs_dat_commit_end(dat, oldreq, dead); + nilfs_dat_commit_alloc(dat, newreq); +} + +void nilfs_dat_abort_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + nilfs_dat_abort_end(dat, oldreq); + nilfs_dat_abort_alloc(dat, newreq); +} + /** * nilfs_dat_mark_dirty - * @dat: DAT file inode diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index d328b81eead4..406070d3ff49 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -27,7 +27,6 @@ #include <linux/buffer_head.h> #include <linux/fs.h> -#define NILFS_DAT_GFP NILFS_MDT_GFP struct nilfs_palloc_req; @@ -39,10 +38,15 @@ void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *); int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, sector_t); -void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *); int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); +void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *, int); +void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); int nilfs_dat_mark_dirty(struct inode *, __u64); int nilfs_dat_freev(struct inode *, __u64 *, size_t); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 54100acc1102..1a4fa04cf071 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -43,7 +43,6 @@ */ #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include "nilfs.h" #include "page.h" diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 342d9765df8d..d369ac718277 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -125,106 +125,64 @@ static void nilfs_direct_set_target_v(struct nilfs_direct *direct, direct->d_bmap.b_last_allocated_ptr = ptr; } -static int nilfs_direct_prepare_insert(struct nilfs_direct *direct, - __u64 key, - union nilfs_bmap_ptr_req *req, - struct nilfs_bmap_stats *stats) -{ - int ret; - - if (NILFS_BMAP_USE_VBN(&direct->d_bmap)) - req->bpr_ptr = nilfs_direct_find_target_v(direct, key); - ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req); - if (ret < 0) - return ret; - - stats->bs_nblocks = 1; - return 0; -} - -static void nilfs_direct_commit_insert(struct nilfs_direct *direct, - union nilfs_bmap_ptr_req *req, - __u64 key, __u64 ptr) -{ - struct buffer_head *bh; - - /* ptr must be a pointer to a buffer head. */ - bh = (struct buffer_head *)((unsigned long)ptr); - set_buffer_nilfs_volatile(bh); - - nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req); - nilfs_direct_set_ptr(direct, key, req->bpr_ptr); - - if (!nilfs_bmap_dirty(&direct->d_bmap)) - nilfs_bmap_set_dirty(&direct->d_bmap); - - if (NILFS_BMAP_USE_VBN(&direct->d_bmap)) - nilfs_direct_set_target_v(direct, key, req->bpr_ptr); -} - static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) { - struct nilfs_direct *direct; + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; union nilfs_bmap_ptr_req req; - struct nilfs_bmap_stats stats; + struct inode *dat = NULL; + struct buffer_head *bh; int ret; - direct = (struct nilfs_direct *)bmap; if (key > NILFS_DIRECT_KEY_MAX) return -ENOENT; if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) return -EEXIST; - ret = nilfs_direct_prepare_insert(direct, key, &req, &stats); - if (ret < 0) - return ret; - nilfs_direct_commit_insert(direct, &req, key, ptr); - nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + if (NILFS_BMAP_USE_VBN(bmap)) { + req.bpr_ptr = nilfs_direct_find_target_v(direct, key); + dat = nilfs_bmap_get_dat(bmap); + } + ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat); + if (!ret) { + /* ptr must be a pointer to a buffer head. */ + bh = (struct buffer_head *)((unsigned long)ptr); + set_buffer_nilfs_volatile(bh); - return 0; -} + nilfs_bmap_commit_alloc_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(direct, key, req.bpr_ptr); -static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, - union nilfs_bmap_ptr_req *req, - __u64 key, - struct nilfs_bmap_stats *stats) -{ - int ret; + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); - req->bpr_ptr = nilfs_direct_get_ptr(direct, key); - ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req); - if (!ret) - stats->bs_nblocks = 1; - return ret; -} + if (NILFS_BMAP_USE_VBN(bmap)) + nilfs_direct_set_target_v(direct, key, req.bpr_ptr); -static void nilfs_direct_commit_delete(struct nilfs_direct *direct, - union nilfs_bmap_ptr_req *req, - __u64 key) -{ - nilfs_bmap_commit_end_ptr(&direct->d_bmap, req); - nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); + nilfs_bmap_add_blocks(bmap, 1); + } + return ret; } static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) { - struct nilfs_direct *direct; + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; union nilfs_bmap_ptr_req req; - struct nilfs_bmap_stats stats; + struct inode *dat; int ret; - direct = (struct nilfs_direct *)bmap; - if ((key > NILFS_DIRECT_KEY_MAX) || + if (key > NILFS_DIRECT_KEY_MAX || nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) return -ENOENT; - ret = nilfs_direct_prepare_delete(direct, &req, key, &stats); - if (ret < 0) - return ret; - nilfs_direct_commit_delete(direct, &req, key); - nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; + req.bpr_ptr = nilfs_direct_get_ptr(direct, key); - return 0; + ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat); + if (!ret) { + nilfs_bmap_commit_end_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); + nilfs_bmap_sub_blocks(bmap, 1); + } + return ret; } static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) @@ -310,59 +268,56 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, return 0; } -static int nilfs_direct_propagate_v(struct nilfs_direct *direct, - struct buffer_head *bh) +static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, + struct buffer_head *bh) { - union nilfs_bmap_ptr_req oldreq, newreq; + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; + struct nilfs_palloc_req oldreq, newreq; + struct inode *dat; __u64 key; __u64 ptr; int ret; - key = nilfs_bmap_data_get_key(&direct->d_bmap, bh); + if (!NILFS_BMAP_USE_VBN(bmap)) + return 0; + + dat = nilfs_bmap_get_dat(bmap); + key = nilfs_bmap_data_get_key(bmap, bh); ptr = nilfs_direct_get_ptr(direct, key); if (!buffer_nilfs_volatile(bh)) { - oldreq.bpr_ptr = ptr; - newreq.bpr_ptr = ptr; - ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq, - &newreq); + oldreq.pr_entry_nr = ptr; + newreq.pr_entry_nr = ptr; + ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq); if (ret < 0) return ret; - nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq); + nilfs_dat_commit_update(dat, &oldreq, &newreq, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); set_buffer_nilfs_volatile(bh); - nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); + nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr); } else - ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr); + ret = nilfs_dat_mark_dirty(dat, ptr); return ret; } -static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, - struct buffer_head *bh) -{ - struct nilfs_direct *direct = (struct nilfs_direct *)bmap; - - return NILFS_BMAP_USE_VBN(bmap) ? - nilfs_direct_propagate_v(direct, bh) : 0; -} - static int nilfs_direct_assign_v(struct nilfs_direct *direct, __u64 key, __u64 ptr, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { + struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap); union nilfs_bmap_ptr_req req; int ret; req.bpr_ptr = ptr; - ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr); - if (unlikely(ret < 0)) - return ret; - - binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); - binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); - - return 0; + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (!ret) { + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); + binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + } + return ret; } static int nilfs_direct_assign_p(struct nilfs_direct *direct, diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 5d30a35679b5..ecc3ba76db47 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -31,7 +31,6 @@ #include "mdt.h" #include "alloc.h" -#define NILFS_IFILE_GFP NILFS_MDT_GFP static inline struct nilfs_inode * nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index fe9d8f2a13f8..807e584b163d 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -430,7 +430,8 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); - if (nilfs_read_inode_common(inode, raw_inode)) + err = nilfs_read_inode_common(inode, raw_inode); + if (err) goto failed_unmap; if (S_ISREG(inode->i_mode)) { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 6ea5f872e2de..6572ea4bc4df 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -442,12 +442,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, const char *msg; int ret; - ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); - if (ret < 0) { - msg = "cannot read source blocks"; - goto failed; - } - ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); if (ret < 0) { /* @@ -548,7 +542,25 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, } } - ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + /* + * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(), + * which will operates an inode list without blocking. + * To protect the list from concurrent operations, + * nilfs_ioctl_move_blocks should be atomic operation. + */ + if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) { + ret = -EBUSY; + goto out_free; + } + + ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); + if (ret < 0) + printk(KERN_ERR "NILFS: GC failed during preparation: " + "cannot read source blocks: err=%d\n", ret); + else + ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + + clear_nilfs_gc_running(nilfs); out_free: while (--n >= 0) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 3d3ddb3f5177..156bf6091a96 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -103,15 +103,12 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, goto failed_unlock; err = -EEXIST; - if (buffer_uptodate(bh) || buffer_mapped(bh)) + if (buffer_uptodate(bh)) goto failed_bh; -#if 0 - /* The uptodate flag is not protected by the page lock, but - the mapped flag is. Thus, we don't have to wait the buffer. */ + wait_on_buffer(bh); if (buffer_uptodate(bh)) goto failed_bh; -#endif bh->b_bdev = nilfs->ns_bdev; err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); @@ -139,7 +136,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, int mode, struct buffer_head **out_bh) { struct buffer_head *bh; - unsigned long blknum = 0; + __u64 blknum = 0; int ret = -ENOMEM; bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); @@ -162,17 +159,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, unlock_buffer(bh); goto out; } - if (!buffer_mapped(bh)) { /* unused buffer */ - ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, - &blknum); - if (unlikely(ret)) { - unlock_buffer(bh); - goto failed_bh; - } - bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; - bh->b_blocknr = blknum; - set_buffer_mapped(bh); + + ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum); + if (unlikely(ret)) { + unlock_buffer(bh); + goto failed_bh; } + bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; + bh->b_blocknr = (sector_t)blknum; + set_buffer_mapped(bh); bh->b_end_io = end_buffer_read_sync; get_bh(bh); @@ -402,6 +397,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) struct inode *inode = container_of(page->mapping, struct inode, i_data); struct super_block *sb = inode->i_sb; + struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs; struct nilfs_sb_info *writer = NULL; int err = 0; @@ -411,9 +407,12 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) if (page->mapping->assoc_mapping) return 0; /* Do not request flush for shadow page cache */ if (!sb) { - writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); - if (!writer) + down_read(&nilfs->ns_writer_sem); + writer = nilfs->ns_writer; + if (!writer) { + up_read(&nilfs->ns_writer_sem); return -EROFS; + } sb = writer->s_super; } @@ -423,7 +422,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) nilfs_flush_segment(sb, inode->i_ino); if (writer) - nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); + up_read(&nilfs->ns_writer_sem); return err; } @@ -514,9 +513,10 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, } struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, - ino_t ino, gfp_t gfp_mask) + ino_t ino) { - struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask); + struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, + NILFS_MDT_GFP); if (!inode) return NULL; diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index df683e0bca6a..431599733c9b 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -74,8 +74,7 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long); int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); int nilfs_mdt_fetch_dirty(struct inode *); -struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, - gfp_t); +struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t); struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, ino_t, gfp_t); void nilfs_mdt_destroy(struct inode *); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index d80cc71be749..6dc83591d118 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -552,7 +552,8 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi, printk(KERN_WARNING "NILFS warning: error recovering data block " "(err=%d, ino=%lu, block-offset=%llu)\n", - err, rb->ino, (unsigned long long)rb->blkoff); + err, (unsigned long)rb->ino, + (unsigned long long)rb->blkoff); if (!err2) err2 = err; next: diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 9e3fe17bb96b..e6d9e37fa241 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -316,10 +316,10 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, { struct bio *bio; - bio = bio_alloc(GFP_NOWAIT, nr_vecs); + bio = bio_alloc(GFP_NOIO, nr_vecs); if (bio == NULL) { while (!bio && (nr_vecs >>= 1)) - bio = bio_alloc(GFP_NOWAIT, nr_vecs); + bio = bio_alloc(GFP_NOIO, nr_vecs); } if (likely(bio)) { bio->bi_bdev = sb->s_bdev; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index aa977549919e..683df89dbae5 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1829,26 +1829,13 @@ static int nilfs_segctor_write(struct nilfs_sc_info *sci, err = nilfs_segbuf_write(segbuf, &wi); res = nilfs_segbuf_wait(segbuf, &wi); - err = unlikely(err) ? : res; - if (unlikely(err)) + err = err ? : res; + if (err) return err; } return 0; } -static int nilfs_page_has_uncleared_buffer(struct page *page) -{ - struct buffer_head *head, *bh; - - head = bh = page_buffers(page); - do { - if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers)) - return 1; - bh = bh->b_this_page; - } while (bh != head); - return 0; -} - static void __nilfs_end_page_io(struct page *page, int err) { if (!err) { @@ -1872,13 +1859,26 @@ static void nilfs_end_page_io(struct page *page, int err) if (!page) return; - if (buffer_nilfs_node(page_buffers(page)) && - nilfs_page_has_uncleared_buffer(page)) - /* For b-tree node pages, this function may be called twice - or more because they might be split in a segment. - This check assures that cleanup has been done for all - buffers in a split btnode page. */ + if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) { + /* + * For b-tree node pages, this function may be called twice + * or more because they might be split in a segment. + */ + if (PageDirty(page)) { + /* + * For pages holding split b-tree node buffers, dirty + * flag on the buffers may be cleared discretely. + * In that case, the page is once redirtied for + * remaining buffers, and it must be cancelled if + * all the buffers get cleaned later. + */ + lock_page(page); + if (nilfs_page_buffers_clean(page)) + __nilfs_clear_page_dirty(page); + unlock_page(page); + } return; + } __nilfs_end_page_io(page, err); } @@ -1940,7 +1940,7 @@ static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci, } if (bh->b_page != fs_page) { nilfs_end_page_io(fs_page, err); - if (unlikely(fs_page == failed_page)) + if (fs_page && fs_page == failed_page) goto done; fs_page = bh->b_page; } @@ -2501,7 +2501,8 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && nilfs_discontinued(nilfs)) { down_write(&nilfs->ns_sem); - req->sb_err = nilfs_commit_super(sbi, 0); + req->sb_err = nilfs_commit_super(sbi, + nilfs_altsb_need_update(nilfs)); up_write(&nilfs->ns_sem); } } @@ -2689,6 +2690,7 @@ static int nilfs_segctor_thread(void *arg) } else { DEFINE_WAIT(wait); int should_sleep = 1; + struct the_nilfs *nilfs; prepare_to_wait(&sci->sc_wait_daemon, &wait, TASK_INTERRUPTIBLE); @@ -2709,6 +2711,9 @@ static int nilfs_segctor_thread(void *arg) finish_wait(&sci->sc_wait_daemon, &wait); timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && time_after_eq(jiffies, sci->sc_timer->expires)); + nilfs = sci->sc_sbi->s_nilfs; + if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs)) + set_nilfs_discontinued(nilfs); } goto loop; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index a2c4d76c3366..0e99e5c0bd0f 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -28,7 +28,6 @@ #include <linux/nilfs2_fs.h> #include "mdt.h" -#define NILFS_SUFILE_GFP NILFS_MDT_GFP static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) { diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8e2ec43b18f4..55f3d6b60732 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -50,6 +50,8 @@ #include <linux/writeback.h> #include <linux/kobject.h> #include <linux/exportfs.h> +#include <linux/seq_file.h> +#include <linux/mount.h> #include "nilfs.h" #include "mdt.h" #include "alloc.h" @@ -65,7 +67,6 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " "(NILFS)"); MODULE_LICENSE("GPL"); -static void nilfs_write_super(struct super_block *sb); static int nilfs_remount(struct super_block *sb, int *flags, char *data); /** @@ -311,9 +312,6 @@ static void nilfs_put_super(struct super_block *sb) lock_kernel(); - if (sb->s_dirt) - nilfs_write_super(sb); - nilfs_detach_segment_constructor(sbi); if (!(sb->s_flags & MS_RDONLY)) { @@ -336,63 +334,21 @@ static void nilfs_put_super(struct super_block *sb) unlock_kernel(); } -/** - * nilfs_write_super - write super block(s) of NILFS - * @sb: super_block - * - * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and - * clears s_dirt. This function is called in the section protected by - * lock_super(). - * - * The s_dirt flag is managed by each filesystem and we protect it by ns_sem - * of the struct the_nilfs. Lock order must be as follows: - * - * 1. lock_super() - * 2. down_write(&nilfs->ns_sem) - * - * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer - * of the super block (nilfs->ns_sbp[]). - * - * In most cases, VFS functions call lock_super() before calling these - * methods. So we must be careful not to bring on deadlocks when using - * lock_super(); see generic_shutdown_super(), write_super(), and so on. - * - * Note that order of lock_kernel() and lock_super() depends on contexts - * of VFS. We should also note that lock_kernel() can be used in its - * protective section and only the outermost one has an effect. - */ -static void nilfs_write_super(struct super_block *sb) +static int nilfs_sync_fs(struct super_block *sb, int wait) { struct nilfs_sb_info *sbi = NILFS_SB(sb); struct the_nilfs *nilfs = sbi->s_nilfs; - - down_write(&nilfs->ns_sem); - if (!(sb->s_flags & MS_RDONLY)) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; - u64 t = get_seconds(); - int dupsb; - - if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] && - t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) { - up_write(&nilfs->ns_sem); - return; - } - dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; - nilfs_commit_super(sbi, dupsb); - } - sb->s_dirt = 0; - up_write(&nilfs->ns_sem); -} - -static int nilfs_sync_fs(struct super_block *sb, int wait) -{ int err = 0; - nilfs_write_super(sb); - /* This function is called when super block should be written back */ if (wait) err = nilfs_construct_segment(sb); + + down_write(&nilfs->ns_sem); + if (sb->s_dirt) + nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + return err; } @@ -407,8 +363,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) list_add(&sbi->s_list, &nilfs->ns_supers); up_write(&nilfs->ns_super_sem); - sbi->s_ifile = nilfs_mdt_new( - nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); + sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO); if (!sbi->s_ifile) return -ENOMEM; @@ -416,8 +371,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) if (unlikely(err)) goto failed; + down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, &bh_cp); + up_read(&nilfs->ns_segctor_sem); if (unlikely(err)) { if (err == -ENOENT || err == -EINVAL) { printk(KERN_ERR @@ -527,6 +484,26 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct super_block *sb = vfs->mnt_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + + if (!nilfs_test_opt(sbi, BARRIER)) + seq_printf(seq, ",barrier=off"); + if (nilfs_test_opt(sbi, SNAPSHOT)) + seq_printf(seq, ",cp=%llu", + (unsigned long long int)sbi->s_snapshot_cno); + if (nilfs_test_opt(sbi, ERRORS_RO)) + seq_printf(seq, ",errors=remount-ro"); + if (nilfs_test_opt(sbi, ERRORS_PANIC)) + seq_printf(seq, ",errors=panic"); + if (nilfs_test_opt(sbi, STRICT_ORDER)) + seq_printf(seq, ",order=strict"); + + return 0; +} + static struct super_operations nilfs_sops = { .alloc_inode = nilfs_alloc_inode, .destroy_inode = nilfs_destroy_inode, @@ -536,7 +513,7 @@ static struct super_operations nilfs_sops = { /* .drop_inode = nilfs_drop_inode, */ .delete_inode = nilfs_delete_inode, .put_super = nilfs_put_super, - .write_super = nilfs_write_super, + /* .write_super = nilfs_write_super, */ .sync_fs = nilfs_sync_fs, /* .write_super_lockfs */ /* .unlockfs */ @@ -544,7 +521,7 @@ static struct super_operations nilfs_sops = { .remount_fs = nilfs_remount, .clear_inode = nilfs_clear_inode, /* .umount_begin */ - /* .show_options */ + .show_options = nilfs_show_options }; static struct inode * @@ -814,10 +791,15 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, if (sb->s_flags & MS_RDONLY) { if (nilfs_test_opt(sbi, SNAPSHOT)) { + down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, sbi->s_snapshot_cno); - if (err < 0) + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + if (err == -ENOENT) + err = -EINVAL; goto failed_sbi; + } if (!err) { printk(KERN_ERR "NILFS: The specified checkpoint is " @@ -1125,10 +1107,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, */ sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); - if (!sd.cno) - /* trying to get the latest checkpoint. */ - sd.cno = nilfs_last_cno(nilfs); - /* * Get super block instance holding the nilfs_sb_info struct. * A new instance is allocated if no existing mount is present or diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 8b8889825716..ad391a8c3e7e 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -68,12 +68,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev) nilfs->ns_bdev = bdev; atomic_set(&nilfs->ns_count, 1); - atomic_set(&nilfs->ns_writer_refcount, -1); atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); init_rwsem(&nilfs->ns_super_sem); mutex_init(&nilfs->ns_mount_mutex); - mutex_init(&nilfs->ns_writer_mutex); + init_rwsem(&nilfs->ns_writer_sem); INIT_LIST_HEAD(&nilfs->ns_list); INIT_LIST_HEAD(&nilfs->ns_supers); spin_lock_init(&nilfs->ns_last_segment_lock); @@ -188,23 +187,19 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, inode_size = nilfs->ns_inode_size; err = -ENOMEM; - nilfs->ns_dat = nilfs_mdt_new( - nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); if (unlikely(!nilfs->ns_dat)) goto failed; - nilfs->ns_gc_dat = nilfs_mdt_new( - nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); if (unlikely(!nilfs->ns_gc_dat)) goto failed_dat; - nilfs->ns_cpfile = nilfs_mdt_new( - nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP); + nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO); if (unlikely(!nilfs->ns_cpfile)) goto failed_gc_dat; - nilfs->ns_sufile = nilfs_mdt_new( - nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP); + nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO); if (unlikely(!nilfs->ns_sufile)) goto failed_cpfile; @@ -596,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); - bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; - if (!bdi) - bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; + bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; nilfs->ns_bdi = bdi ? : &default_backing_dev_info; /* Finding last segment */ diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index e8adbffc626f..20abd55881e0 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -37,6 +37,7 @@ enum { THE_NILFS_LOADED, /* Roll-back/roll-forward has done and the latest checkpoint was loaded */ THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ + THE_NILFS_GC_RUNNING, /* gc process is running */ }; /** @@ -50,8 +51,7 @@ enum { * @ns_sem: semaphore for shared states * @ns_super_sem: semaphore for global operations across super block instances * @ns_mount_mutex: mutex protecting mount process of nilfs - * @ns_writer_mutex: mutex protecting ns_writer attach/detach - * @ns_writer_refcount: number of referrers on ns_writer + * @ns_writer_sem: semaphore protecting ns_writer attach/detach * @ns_current: back pointer to current mount * @ns_sbh: buffer heads of on-disk super blocks * @ns_sbp: pointers to super block data @@ -100,8 +100,7 @@ struct the_nilfs { struct rw_semaphore ns_sem; struct rw_semaphore ns_super_sem; struct mutex ns_mount_mutex; - struct mutex ns_writer_mutex; - atomic_t ns_writer_refcount; + struct rw_semaphore ns_writer_sem; /* * components protected by ns_super_sem @@ -197,11 +196,26 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \ THE_NILFS_FNS(INIT, init) THE_NILFS_FNS(LOADED, loaded) THE_NILFS_FNS(DISCONTINUED, discontinued) +THE_NILFS_FNS(GC_RUNNING, gc_running) /* Minimum interval of periodical update of superblocks (in seconds) */ #define NILFS_SB_FREQ 10 #define NILFS_ALTSB_FREQ 60 /* spare superblock */ +static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) +{ + u64 t = get_seconds(); + return t < nilfs->ns_sbwtime[0] || + t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ; +} + +static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs) +{ + u64 t = get_seconds(); + struct nilfs_super_block **sbp = nilfs->ns_sbp; + return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; +} + void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); struct the_nilfs *find_or_create_nilfs(struct block_device *); void put_nilfs(struct the_nilfs *); @@ -221,39 +235,26 @@ static inline void get_nilfs(struct the_nilfs *nilfs) atomic_inc(&nilfs->ns_count); } -static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs) -{ - if (atomic_inc_and_test(&nilfs->ns_writer_refcount)) - mutex_lock(&nilfs->ns_writer_mutex); - return nilfs->ns_writer; -} - -static inline void nilfs_put_writer(struct the_nilfs *nilfs) -{ - if (atomic_add_negative(-1, &nilfs->ns_writer_refcount)) - mutex_unlock(&nilfs->ns_writer_mutex); -} - static inline void nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) { - mutex_lock(&nilfs->ns_writer_mutex); + down_write(&nilfs->ns_writer_sem); nilfs->ns_writer = sbi; - mutex_unlock(&nilfs->ns_writer_mutex); + up_write(&nilfs->ns_writer_sem); } static inline void nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) { - mutex_lock(&nilfs->ns_writer_mutex); + down_write(&nilfs->ns_writer_sem); if (sbi == nilfs->ns_writer) nilfs->ns_writer = NULL; - mutex_unlock(&nilfs->ns_writer_mutex); + up_write(&nilfs->ns_writer_sem); } static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) { - if (!atomic_dec_and_test(&sbi->s_count)) + if (atomic_dec_and_test(&sbi->s_count)) kfree(sbi); } diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 31dac7e3b0f1..dffbb0911d02 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -1,15 +1,5 @@ config FSNOTIFY - bool "Filesystem notification backend" - default y - ---help--- - fsnotify is a backend for filesystem notification. fsnotify does - not provide any userspace interface but does provide the basis - needed for other notification schemes such as dnotify, inotify, - and fanotify. - - Say Y here to enable fsnotify suport. - - If unsure, say Y. + def_bool n source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig index 904ff8d5405a..f9c1ca139d8f 100644 --- a/fs/notify/dnotify/Kconfig +++ b/fs/notify/dnotify/Kconfig @@ -1,6 +1,6 @@ config DNOTIFY bool "Dnotify support" - depends on FSNOTIFY + select FSNOTIFY default y help Dnotify is a directory-based per-fd file change notification system diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ec2f7bd76818..037e878e03fc 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const if (!group->ops->should_send_event(group, to_tell, mask)) continue; if (!event) { - event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); + event = fsnotify_create_event(to_tell, mask, data, + data_is, file_name, cookie, + GFP_KERNEL); /* shit, we OOM'd and now we can't tell, maybe * someday someone else will want to do something * here */ diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 5356884289a1..3e56dbffe729 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -15,7 +15,7 @@ config INOTIFY config INOTIFY_USER bool "Inotify support for userspace" - depends on FSNOTIFY + select FSNOTIFY default y ---help--- Say Y here to enable inotify support for userspace, including the diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 47cd258fd24d..c9ee67b442e1 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -62,13 +62,14 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev event_priv->wd = wd; ret = fsnotify_add_notify_event(group, event, fsn_event_priv); - /* EEXIST is not an error */ - if (ret == -EEXIST) - ret = 0; - - /* did event_priv get attached? */ - if (list_empty(&fsn_event_priv->event_list)) + if (ret) { inotify_free_event_priv(fsn_event_priv); + /* EEXIST says we tail matched, EOVERFLOW isn't something + * to report up the stack. */ + if ((ret == -EEXIST) || + (ret == -EOVERFLOW)) + ret = 0; + } /* * If we hold the entry until after the event is on the queue @@ -104,16 +105,45 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode return send; } +/* + * This is NEVER supposed to be called. Inotify marks should either have been + * removed from the idr when the watch was removed or in the + * fsnotify_destroy_mark_by_group() call when the inotify instance was being + * torn down. This is only called if the idr is about to be freed but there + * are still marks in it. + */ static int idr_callback(int id, void *p, void *data) { - BUG(); + struct fsnotify_mark_entry *entry; + struct inotify_inode_mark_entry *ientry; + static bool warned = false; + + if (warned) + return 0; + + warned = false; + entry = p; + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); + + WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in " + "idr. Probably leaking memory\n", id, p, data); + + /* + * I'm taking the liberty of assuming that the mark in question is a + * valid address and I'm dereferencing it. This might help to figure + * out why we got here and the panic is no worse than the original + * BUG() that was here. + */ + if (entry) + printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n", + entry->group, entry->inode, ientry->wd); return 0; } static void inotify_free_group_priv(struct fsnotify_group *group) { /* ideally the idr is empty and we won't hit the BUG in teh callback */ - idr_for_each(&group->inotify_data.idr, idr_callback, NULL); + idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_remove_all(&group->inotify_data.idr); idr_destroy(&group->inotify_data.idr); } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ff27a2965844..dcd2040d330c 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -47,9 +47,6 @@ static struct vfsmount *inotify_mnt __read_mostly; -/* this just sits here and wastes global memory. used to just pad userspace messages with zeros */ -static struct inotify_event nul_inotify_event; - /* these are configurable via /proc/sys/fs/inotify/ */ static int inotify_max_user_instances __read_mostly; static int inotify_max_queued_events __read_mostly; @@ -57,7 +54,6 @@ int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; struct kmem_cache *event_priv_cachep __read_mostly; -static struct fsnotify_event *inotify_ignored_event; /* * When inotify registers a new group it increments this and uses that @@ -158,7 +154,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, event = fsnotify_peek_notify_event(group); - event_size += roundup(event->name_len, event_size); + if (event->name_len) + event_size += roundup(event->name_len + 1, event_size); if (event_size > count) return ERR_PTR(-EINVAL); @@ -184,7 +181,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fsnotify_event_private_data *fsn_priv; struct inotify_event_private_data *priv; size_t event_size = sizeof(struct inotify_event); - size_t name_len; + size_t name_len = 0; /* we get the inotify watch descriptor from the event private data */ spin_lock(&event->lock); @@ -200,8 +197,12 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, inotify_free_event_priv(fsn_priv); } - /* round up event->name_len so it is a multiple of event_size */ - name_len = roundup(event->name_len, event_size); + /* + * round up event->name_len so it is a multiple of event_size + * plus an extra byte for the terminating '\0'. + */ + if (event->name_len) + name_len = roundup(event->name_len + 1, event_size); inotify_event.len = name_len; inotify_event.mask = inotify_mask_to_arg(event->mask); @@ -225,8 +226,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, return -EFAULT; buf += event->name_len; - /* fill userspace with 0's from nul_inotify_event */ - if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) + /* fill userspace with 0's */ + if (clear_user(buf, len_to_zero)) return -EFAULT; buf += len_to_zero; event_size += name_len; @@ -327,8 +328,9 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, list_for_each_entry(holder, &group->notification_list, event_list) { event = holder->event; send_len += sizeof(struct inotify_event); - send_len += roundup(event->name_len, - sizeof(struct inotify_event)); + if (event->name_len) + send_len += roundup(event->name_len + 1, + sizeof(struct inotify_event)); } mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -366,20 +368,71 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns } /* - * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the - * internal reference help on the mark because it is in the idr. + * Remove the mark from the idr (if present) and drop the reference + * on the mark because it was in the idr. + */ +static void inotify_remove_from_idr(struct fsnotify_group *group, + struct inotify_inode_mark_entry *ientry) +{ + struct idr *idr; + struct fsnotify_mark_entry *entry; + struct inotify_inode_mark_entry *found_ientry; + int wd; + + spin_lock(&group->inotify_data.idr_lock); + idr = &group->inotify_data.idr; + wd = ientry->wd; + + if (wd == -1) + goto out; + + entry = idr_find(&group->inotify_data.idr, wd); + if (unlikely(!entry)) + goto out; + + found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); + if (unlikely(found_ientry != ientry)) { + /* We found an entry in the idr with the right wd, but it's + * not the entry we were told to remove. eparis seriously + * fucked up somewhere. */ + WARN_ON(1); + ientry->wd = -1; + goto out; + } + + /* One ref for being in the idr, one ref held by the caller */ + BUG_ON(atomic_read(&entry->refcnt) < 2); + + idr_remove(idr, wd); + ientry->wd = -1; + + /* removed from the idr, drop that ref */ + fsnotify_put_mark(entry); +out: + spin_unlock(&group->inotify_data.idr_lock); +} + +/* + * Send IN_IGNORED for this wd, remove this wd from the idr. */ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) { struct inotify_inode_mark_entry *ientry; + struct fsnotify_event *ignored_event; struct inotify_event_private_data *event_priv; struct fsnotify_event_private_data *fsn_event_priv; - struct idr *idr; + int ret; + + ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, + FSNOTIFY_EVENT_NONE, NULL, 0, + GFP_NOFS); + if (!ignored_event) + return; ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); if (unlikely(!event_priv)) goto skip_send_ignore; @@ -388,22 +441,19 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, fsn_event_priv->group = group; event_priv->wd = ientry->wd; - fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); - - /* did the private data get added? */ - if (list_empty(&fsn_event_priv->event_list)) + ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); + if (ret) inotify_free_event_priv(fsn_event_priv); skip_send_ignore: + /* matches the reference taken when the event was created */ + fsnotify_put_event(ignored_event); + /* remove this entry from the idr */ - spin_lock(&group->inotify_data.idr_lock); - idr = &group->inotify_data.idr; - idr_remove(idr, ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); + inotify_remove_from_idr(group, ientry); - /* removed from idr, drop that reference */ - fsnotify_put_mark(entry); + atomic_dec(&group->inotify_data.user->inotify_watches); } /* ding dong the mark is dead */ @@ -414,67 +464,29 @@ static void inotify_free_mark(struct fsnotify_mark_entry *entry) kmem_cache_free(inotify_inode_mark_cachep, ientry); } -static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +static int inotify_update_existing_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) { - struct fsnotify_mark_entry *entry = NULL; + struct fsnotify_mark_entry *entry; struct inotify_inode_mark_entry *ientry; - int ret = 0; - int add = (arg & IN_MASK_ADD); - __u32 mask; __u32 old_mask, new_mask; + __u32 mask; + int add = (arg & IN_MASK_ADD); + int ret; /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); if (unlikely(!mask)) return -EINVAL; - ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); - if (unlikely(!ientry)) - return -ENOMEM; - /* we set the mask at the end after attaching it */ - fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); - ientry->wd = 0; - -find_entry: spin_lock(&inode->i_lock); entry = fsnotify_find_mark_entry(group, inode); spin_unlock(&inode->i_lock); - if (entry) { - kmem_cache_free(inotify_inode_mark_cachep, ientry); - ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - } else { - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { - ret = -ENOSPC; - goto out_err; - } - - ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode); - if (ret == -EEXIST) - goto find_entry; - else if (ret) - goto out_err; + if (!entry) + return -ENOENT; - entry = &ientry->fsn_entry; -retry: - ret = -ENOMEM; - if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) - goto out_err; - - spin_lock(&group->inotify_data.idr_lock); - /* if entry is added to the idr we keep the reference obtained - * through fsnotify_mark_add. remember to drop this reference - * when entry is removed from idr */ - ret = idr_get_new_above(&group->inotify_data.idr, entry, - ++group->inotify_data.last_wd, - &ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); - if (ret) { - if (ret == -EAGAIN) - goto retry; - goto out_err; - } - atomic_inc(&group->inotify_data.user->inotify_watches); - } + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); spin_lock(&entry->lock); @@ -506,14 +518,108 @@ retry: fsnotify_recalc_group_mask(group); } - return ientry->wd; + /* return the wd */ + ret = ientry->wd; -out_err: - /* see this isn't supposed to happen, just kill the watch */ - if (entry) { - fsnotify_destroy_mark_by_entry(entry); - fsnotify_put_mark(entry); + /* match the get from fsnotify_find_mark_entry() */ + fsnotify_put_mark(entry); + + return ret; +} + +static int inotify_new_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) +{ + struct inotify_inode_mark_entry *tmp_ientry; + __u32 mask; + int ret; + + /* don't allow invalid bits: we don't want flags set */ + mask = inotify_arg_to_mask(arg); + if (unlikely(!mask)) + return -EINVAL; + + tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); + if (unlikely(!tmp_ientry)) + return -ENOMEM; + + fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); + tmp_ientry->fsn_entry.mask = mask; + tmp_ientry->wd = -1; + + ret = -ENOSPC; + if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) + goto out_err; +retry: + ret = -ENOMEM; + if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) + goto out_err; + + spin_lock(&group->inotify_data.idr_lock); + ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, + group->inotify_data.last_wd, + &tmp_ientry->wd); + spin_unlock(&group->inotify_data.idr_lock); + if (ret) { + /* idr was out of memory allocate and try again */ + if (ret == -EAGAIN) + goto retry; + goto out_err; + } + + /* we put the mark on the idr, take a reference */ + fsnotify_get_mark(&tmp_ientry->fsn_entry); + + /* we are on the idr, now get on the inode */ + ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); + if (ret) { + /* we failed to get on the inode, get off the idr */ + inotify_remove_from_idr(group, tmp_ientry); + goto out_err; } + + /* update the idr hint, who cares about races, it's just a hint */ + group->inotify_data.last_wd = tmp_ientry->wd; + + /* increment the number of watches the user has */ + atomic_inc(&group->inotify_data.user->inotify_watches); + + /* return the watch descriptor for this new entry */ + ret = tmp_ientry->wd; + + /* match the ref from fsnotify_init_markentry() */ + fsnotify_put_mark(&tmp_ientry->fsn_entry); + + /* if this mark added a new event update the group mask */ + if (mask & ~group->mask) + fsnotify_recalc_group_mask(group); + +out_err: + if (ret < 0) + kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); + + return ret; +} + +static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +{ + int ret = 0; + +retry: + /* try to update and existing watch with the new arg */ + ret = inotify_update_existing_watch(group, inode, arg); + /* no mark present, try to add a new one */ + if (ret == -ENOENT) + ret = inotify_new_watch(group, inode, arg); + /* + * inotify_new_watch could race with another thread which did an + * inotify_new_watch between the update_existing and the add watch + * here, go back and try to update an existing mark again. + */ + if (ret == -EEXIST) + goto retry; + return ret; } @@ -532,7 +638,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.last_wd = 0; + group->inotify_data.last_wd = 1; group->inotify_data.user = user; group->inotify_data.fa = NULL; @@ -721,9 +827,6 @@ static int __init inotify_user_setup(void) inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); - inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0); - if (!inotify_ignored_event) - panic("unable to allocate the inotify ignored event\n"); inotify_max_queued_events = 16384; inotify_max_user_instances = 128; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 959b73e756fd..3816d5750dd5 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -136,18 +136,28 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new { if ((old->mask == new->mask) && (old->to_tell == new->to_tell) && - (old->data_type == new->data_type)) { + (old->data_type == new->data_type) && + (old->name_len == new->name_len)) { switch (old->data_type) { case (FSNOTIFY_EVENT_INODE): - if (old->inode == new->inode) + /* remember, after old was put on the wait_q we aren't + * allowed to look at the inode any more, only thing + * left to check was if the file_name is the same */ + if (old->name_len && + !strcmp(old->file_name, new->file_name)) return true; break; case (FSNOTIFY_EVENT_PATH): if ((old->path.mnt == new->path.mnt) && (old->path.dentry == new->path.dentry)) return true; + break; case (FSNOTIFY_EVENT_NONE): - return true; + if (old->mask & FS_Q_OVERFLOW) + return true; + else if (old->mask & FS_IN_IGNORED) + return false; + return false; }; } return false; @@ -165,9 +175,7 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even struct list_head *list = &group->notification_list; struct fsnotify_event_holder *last_holder; struct fsnotify_event *last_event; - - /* easy to tell if priv was attached to the event */ - INIT_LIST_HEAD(&priv->event_list); + int ret = 0; /* * There is one fsnotify_event_holder embedded inside each fsnotify_event. @@ -188,6 +196,7 @@ alloc_holder: if (group->q_len >= group->max_events) { event = &q_overflow_event; + ret = -EOVERFLOW; /* sorry, no private data on the overflow event */ priv = NULL; } @@ -229,7 +238,7 @@ alloc_holder: mutex_unlock(&group->notification_mutex); wake_up(&group->notification_waitq); - return 0; + return ret; } /* @@ -339,18 +348,19 @@ static void initialize_event(struct fsnotify_event *event) * @name the filename, if available */ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const char *name, u32 cookie) + int data_type, const char *name, u32 cookie, + gfp_t gfp) { struct fsnotify_event *event; - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); + event = kmem_cache_alloc(fsnotify_event_cachep, gfp); if (!event) return NULL; initialize_event(event); if (name) { - event->file_name = kstrdup(name, GFP_KERNEL); + event->file_name = kstrdup(name, gfp); if (!event->file_name) { kmem_cache_free(fsnotify_event_cachep, event); return NULL; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 3140a4429af1..4350d4993b18 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2076,14 +2076,6 @@ err_out: *ppos = pos; if (cached_page) page_cache_release(cached_page); - /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */ - if (likely(!status)) { - if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) { - if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb)) - status = generic_osync_inode(vi, mapping, - OSYNC_METADATA|OSYNC_DATA); - } - } pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? "written" : "status", (unsigned long)written, @@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, mutex_lock(&inode->i_mutex); ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = sync_page_range(inode, mapping, pos, ret); + if (ret > 0) { + int err = generic_write_sync(file, pos, ret); if (err < 0) ret = err; } @@ -2173,8 +2165,8 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov, if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = sync_page_range(inode, mapping, *ppos - ret, ret); + if (ret > 0) { + int err = generic_write_sync(file, *ppos - ret, ret); if (err < 0) ret = err; } diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 23bf68453d7d..1caa0ef0b2bb 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -384,13 +384,12 @@ unm_err_out: * it is dirty in the inode meta data rather than the data page cache of the * inode, and thus there are no data pages that need writing out. Therefore, a * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the - * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to - * ensure ->write_inode is called from generic_osync_inode() and this needs to - * happen or the file data would not necessarily hit the device synchronously, - * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC - * simply "feels" better than just I_DIRTY_SYNC, since the file data has not - * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own - * would suggest. + * other hand, is not sufficient, because ->write_inode needs to be called even + * in case of fdatasync. This needs to happen or the file data would not + * necessarily hit the device synchronously, even though the vfs inode has the + * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just + * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, + * which is not what I_DIRTY_SYNC on its own would suggest. */ void __mark_mft_record_dirty(ntfs_inode *ni) { diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9edcde4974aa..ab513ddaeff2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1914,7 +1914,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, * immediately to their right. */ left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); - if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) { + if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) { + BUG_ON(right_child_el->l_tree_depth); BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); } @@ -2476,15 +2477,37 @@ out_ret_path: return ret; } -static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, - struct ocfs2_path *path) +static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, + int subtree_index, struct ocfs2_path *path) { - int i, idx; + int i, idx, ret; struct ocfs2_extent_rec *rec; struct ocfs2_extent_list *el; struct ocfs2_extent_block *eb; u32 range; + /* + * In normal tree rotation process, we will never touch the + * tree branch above subtree_index and ocfs2_extend_rotate_transaction + * doesn't reserve the credits for them either. + * + * But we do have a special case here which will update the rightmost + * records for all the bh in the path. + * So we have to allocate extra credits and access them. + */ + ret = ocfs2_extend_trans(handle, + handle->h_buffer_credits + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + /* Path should always be rightmost. */ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; BUG_ON(eb->h_next_leaf_blk != 0ULL); @@ -2505,6 +2528,8 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, ocfs2_journal_dirty(handle, path->p_node[i].bh); } +out: + return ret; } static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, @@ -2717,7 +2742,12 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, if (del_right_subtree) { ocfs2_unlink_subtree(inode, handle, left_path, right_path, subtree_index, dealloc); - ocfs2_update_edge_lengths(inode, handle, left_path); + ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); @@ -3034,7 +3064,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, ocfs2_unlink_subtree(inode, handle, left_path, path, subtree_index, dealloc); - ocfs2_update_edge_lengths(inode, handle, left_path); + ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); @@ -6816,7 +6851,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } status = 0; bail: - + brelse(last_eb_bh); mlog_exit(status); return status; } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index b2c52b3a1484..8a1e61545f41 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -193,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, (unsigned long long)OCFS2_I(inode)->ip_blkno); mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); dump_stack(); + goto bail; } past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); @@ -894,18 +895,17 @@ struct ocfs2_write_cluster_desc { */ unsigned c_new; unsigned c_unwritten; + unsigned c_needs_zero; }; -static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d) -{ - return d->c_new || d->c_unwritten; -} - struct ocfs2_write_ctxt { /* Logical cluster position / len of write */ u32 w_cpos; u32 w_clen; + /* First cluster allocated in a nonsparse extend */ + u32 w_first_new_cpos; + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; /* @@ -983,6 +983,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, return -ENOMEM; wc->w_cpos = pos >> osb->s_clustersize_bits; + wc->w_first_new_cpos = UINT_MAX; cend = (pos + len - 1) >> osb->s_clustersize_bits; wc->w_clen = cend - wc->w_cpos + 1; get_bh(di_bh); @@ -1217,20 +1218,18 @@ out: */ static int ocfs2_write_cluster(struct address_space *mapping, u32 phys, unsigned int unwritten, + unsigned int should_zero, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, unsigned user_len) { - int ret, i, new, should_zero = 0; + int ret, i, new; u64 v_blkno, p_blkno; struct inode *inode = mapping->host; struct ocfs2_extent_tree et; new = phys == 0 ? 1 : 0; - if (new || unwritten) - should_zero = 1; - if (new) { u32 tmp_pos; @@ -1301,7 +1300,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, if (tmpret) { mlog_errno(tmpret); if (ret == 0) - tmpret = ret; + ret = tmpret; } } @@ -1341,7 +1340,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping, local_len = osb->s_clustersize - cluster_off; ret = ocfs2_write_cluster(mapping, desc->c_phys, - desc->c_unwritten, data_ac, meta_ac, + desc->c_unwritten, + desc->c_needs_zero, + data_ac, meta_ac, wc, desc->c_cpos, pos, local_len); if (ret) { mlog_errno(ret); @@ -1391,14 +1392,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, * newly allocated cluster. */ desc = &wc->w_desc[0]; - if (ocfs2_should_zero_cluster(desc)) + if (desc->c_needs_zero) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, &wc->w_target_from, NULL); desc = &wc->w_desc[wc->w_clen - 1]; - if (ocfs2_should_zero_cluster(desc)) + if (desc->c_needs_zero) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, NULL, @@ -1466,13 +1467,28 @@ static int ocfs2_populate_write_desc(struct inode *inode, phys++; } + /* + * If w_first_new_cpos is < UINT_MAX, we have a non-sparse + * file that got extended. w_first_new_cpos tells us + * where the newly allocated clusters are so we can + * zero them. + */ + if (desc->c_cpos >= wc->w_first_new_cpos) { + BUG_ON(phys == 0); + desc->c_needs_zero = 1; + } + desc->c_phys = phys; if (phys == 0) { desc->c_new = 1; + desc->c_needs_zero = 1; *clusters_to_alloc = *clusters_to_alloc + 1; } - if (ext_flags & OCFS2_EXT_UNWRITTEN) + + if (ext_flags & OCFS2_EXT_UNWRITTEN) { desc->c_unwritten = 1; + desc->c_needs_zero = 1; + } num_clusters--; } @@ -1632,10 +1648,13 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, if (newsize <= i_size_read(inode)) return 0; - ret = ocfs2_extend_no_holes(inode, newsize, newsize - len); + ret = ocfs2_extend_no_holes(inode, newsize, pos); if (ret) mlog_errno(ret); + wc->w_first_new_cpos = + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + return ret; } @@ -1644,7 +1663,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) { - int ret, credits = OCFS2_INODE_UPDATE_CREDITS; + int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; unsigned int clusters_to_alloc, extents_to_split; struct ocfs2_write_ctxt *wc; struct inode *inode = mapping->host; @@ -1722,8 +1741,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, } - ocfs2_set_target_boundaries(osb, wc, pos, len, - clusters_to_alloc + extents_to_split); + /* + * We have to zero sparse allocated clusters, unwritten extent clusters, + * and non-sparse clusters we just extended. For non-sparse writes, + * we know zeros will only be needed in the first and/or last cluster. + */ + if (clusters_to_alloc || extents_to_split || + (wc->w_clen && (wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero))) + cluster_of_pages = 1; + else + cluster_of_pages = 0; + + ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -1756,8 +1786,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * extent. */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, - clusters_to_alloc + extents_to_split, - mmap_page); + cluster_of_pages, mmap_page); if (ret) { mlog_errno(ret); goto out_quota; diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index b574431a031d..b4957c7d9fe2 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -85,6 +85,17 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, goto bail; } + /* + * If the last lookup failed to create dentry lock, let us + * redo it. + */ + if (!dentry->d_fsdata) { + mlog(0, "Inode %llu doesn't have dentry lock, " + "returning false\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto bail; + } + ret = 1; bail: @@ -310,22 +321,19 @@ out_attach: return ret; } -static DEFINE_SPINLOCK(dentry_list_lock); +DEFINE_SPINLOCK(dentry_list_lock); /* We limit the number of dentry locks to drop in one go. We have * this limit so that we don't starve other users of ocfs2_wq. */ #define DL_INODE_DROP_COUNT 64 /* Drop inode references from dentry locks */ -void ocfs2_drop_dl_inodes(struct work_struct *work) +static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) { - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, - dentry_lock_work); struct ocfs2_dentry_lock *dl; - int drop_count = DL_INODE_DROP_COUNT; spin_lock(&dentry_list_lock); - while (osb->dentry_lock_list && drop_count--) { + while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { dl = osb->dentry_lock_list; osb->dentry_lock_list = dl->dl_next; spin_unlock(&dentry_list_lock); @@ -333,11 +341,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work) kfree(dl); spin_lock(&dentry_list_lock); } - if (osb->dentry_lock_list) + spin_unlock(&dentry_list_lock); +} + +void ocfs2_drop_dl_inodes(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dentry_lock_work); + + __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); + /* + * Don't queue dropping if umount is in progress. We flush the + * list in ocfs2_dismount_volume + */ + spin_lock(&dentry_list_lock); + if (osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) queue_work(ocfs2_wq, &osb->dentry_lock_work); spin_unlock(&dentry_list_lock); } +/* Flush the whole work queue */ +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) +{ + __ocfs2_drop_dl_inodes(osb, -1); +} + /* * ocfs2_dentry_iput() and friends. * @@ -368,7 +397,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, /* We leave dropping of inode reference to ocfs2_wq as that can * possibly lead to inode deletion which gets tricky */ spin_lock(&dentry_list_lock); - if (!osb->dentry_lock_list) + if (!osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) queue_work(ocfs2_wq, &osb->dentry_lock_work); dl->dl_next = osb->dentry_lock_list; osb->dentry_lock_list = dl; diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index faa12e75f98d..f5dd1789acf1 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -49,10 +49,13 @@ struct ocfs2_dentry_lock { int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, u64 parent_blkno); +extern spinlock_t dentry_list_lock; + void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl); void ocfs2_drop_dl_inodes(struct work_struct *work); +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed); diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index d07ddbe4b283..81eff8e58322 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -103,7 +103,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) lock->ast_pending, lock->ml.type); BUG(); } - BUG_ON(!list_empty(&lock->ast_list)); if (lock->ast_pending) mlog(0, "lock has an ast getting flushed right now\n"); diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 1c9efb406a96..02bf17808bdc 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -325,6 +325,7 @@ clear_fields: } static struct backing_dev_info dlmfs_backing_dev_info = { + .name = "ocfs2-dlmfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index bcb9260c3735..43e6e3280569 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1118,7 +1118,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", dlm->name, res->lockname.len, res->lockname.name, - orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery", + orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery", send_to); /* send it */ diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index fcf879ed6930..756f5b0998e0 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -122,7 +122,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, * that still has AST's pending... */ in_use = !list_empty(&lock->ast_list); spin_unlock(&dlm->ast_lock); - if (in_use) { + if (in_use && !(flags & LKM_CANCEL)) { mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " "while waiting for an ast!", res->lockname.len, res->lockname.name); @@ -131,7 +131,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_IN_PROGRESS) { - if (master_node) { + if (master_node && !(flags & LKM_CANCEL)) { mlog(ML_ERROR, "lockres in progress!\n"); spin_unlock(&res->spinlock); return DLM_FORWARD; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 62442e413a00..221c5e98957b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1851,6 +1851,7 @@ relock: if (ret) goto out_dio; + count = ocount; ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); if (ret) @@ -1870,8 +1871,7 @@ relock: goto out_dio; } } else { - written = generic_file_aio_write_nolock(iocb, iov, nr_segs, - *ppos); + written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); } out_dio: @@ -1879,18 +1879,21 @@ out_dio: BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { - /* - * The generic write paths have handled getting data - * to disk, but since we don't make use of the dirty - * inode list, a manual journal commit is necessary - * here. - */ - if (old_size != i_size_read(inode) || - old_clusters != OCFS2_I(inode)->ip_clusters) { + ret = filemap_fdatawrite_range(file->f_mapping, pos, + pos + count - 1); + if (ret < 0) + written = ret; + + if (!ret && (old_size != i_size_read(inode) || + old_clusters != OCFS2_I(inode)->ip_clusters)) { ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; } + + if (!ret) + ret = filemap_fdatawait_range(file->f_mapping, pos, + pos + count - 1); } /* @@ -1918,8 +1921,10 @@ out_sems: mutex_unlock(&inode->i_mutex); + if (written) + ret = written; mlog_exit(ret); - return written ? written : ret; + return ret; } static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, @@ -1988,31 +1993,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, if (ret > 0) { unsigned long nr_pages; + int err; - *ppos += ret; nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; - - mutex_lock(&inode->i_mutex); - err = ocfs2_rw_lock(inode, 1); - if (err < 0) { - mlog_errno(err); - } else { - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - ocfs2_rw_unlock(inode, 1); - } - mutex_unlock(&inode->i_mutex); + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; - if (err) - ret = err; - } balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 9fcd36dcc9a0..467b413bec21 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -7,7 +7,6 @@ #include <linux/fs.h> #include <linux/mount.h> -#include <linux/smp_lock.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f033760ecbea..c48b93ac6b65 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1954,10 +1954,16 @@ void ocfs2_orphan_scan_init(struct ocfs2_super *osb) os->os_osb = osb; os->os_count = 0; os->os_seqno = 0; - os->os_scantime = CURRENT_TIME; mutex_init(&os->os_lock); INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work); +} +void ocfs2_orphan_scan_start(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + os->os_scantime = CURRENT_TIME; if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 5432c7f79cc6..2c3222aec622 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -145,6 +145,7 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, /* Exported only for the journal struct init code in super.c. Do not call. */ void ocfs2_orphan_scan_init(struct ocfs2_super *osb); +void ocfs2_orphan_scan_start(struct ocfs2_super *osb); void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); @@ -329,20 +330,27 @@ int ocfs2_journal_dirty(handle_t *handle, /* extended attribute block update */ #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 +/* Update of a single quota block */ +#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1 + /* global quotafile inode update, data block */ -#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) +#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) +#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS /* * The two writes below can accidentally see global info dirty due * to set_info() quotactl so make them prepared for the writes. */ /* quota data block, global info */ /* Write to local quota file */ -#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1) +#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) /* global quota data block, local quota data block, global quota inode, * global quota info */ -#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3) +#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) static inline int ocfs2_quota_trans_credits(struct super_block *sb) { @@ -355,11 +363,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb) return credits; } -/* Number of credits needed for removing quota structure from file */ -int ocfs2_calc_qdel_credits(struct super_block *sb, int type); -/* Number of credits needed for initialization of new quota structure */ -int ocfs2_calc_qinit_credits(struct super_block *sb, int type); - /* group extend. inode update and last group update. */ #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index c9345ebb8493..39e1d5a39505 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -224,10 +224,12 @@ enum ocfs2_mount_options OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ }; -#define OCFS2_OSB_SOFT_RO 0x0001 -#define OCFS2_OSB_HARD_RO 0x0002 -#define OCFS2_OSB_ERROR_FS 0x0004 -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 + +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_journal; struct ocfs2_slot_info; @@ -490,6 +492,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } + +static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, + unsigned long flag) +{ + unsigned long ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & flag; + spin_unlock(&osb->osb_lock); + return ret; +} + static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index fcdba091af3d..c212cf5a2bdf 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -108,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_OPEN] = "Open", [OCFS2_LOCK_TYPE_FLOCK] = "Flock", [OCFS2_LOCK_TYPE_QINFO] = "Quota", + [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", }; diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index 7365e2e08706..3fb96fcd4c81 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -50,7 +50,6 @@ struct ocfs2_mem_dqinfo { unsigned int dqi_chunks; /* Number of chunks in local quota file */ unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ unsigned int dqi_syncms; /* How often should we sync with other nodes */ - unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */ struct list_head dqi_chunk; /* List of chunks */ struct inode *dqi_gqinode; /* Global quota file inode */ struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index edfa60cd155c..44f2a5e1d042 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -23,6 +23,7 @@ #include "sysfile.h" #include "dlmglue.h" #include "uptodate.h" +#include "super.h" #include "quota.h" static struct workqueue_struct *ocfs2_quota_wq = NULL; @@ -69,6 +70,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot) d->dqb_curspace = cpu_to_le64(m->dqb_curspace); d->dqb_btime = cpu_to_le64(m->dqb_btime); d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_pad1 = d->dqb_pad2 = 0; } static int ocfs2_global_is_id(void *dp, struct dquot *dquot) @@ -113,6 +115,15 @@ int ocfs2_read_quota_block(struct inode *inode, u64 v_block, int rc = 0; struct buffer_head *tmp = *bh; + if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { + ocfs2_error(inode->i_sb, + "Quota file %llu is probably corrupted! Requested " + "to read block %Lu but file has size only %Lu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_block, + (unsigned long long)i_size_read(inode)); + return -EIO; + } rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, ocfs2_validate_quota_block); if (rc) @@ -211,14 +222,13 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA); if (gqinode->i_size < off + len) { - down_write(&OCFS2_I(gqinode)->ip_alloc_sem); - err = ocfs2_extend_no_holes(gqinode, off + len, off); - up_write(&OCFS2_I(gqinode)->ip_alloc_sem); - if (err < 0) - goto out; + loff_t rounded_end = + ocfs2_align_bytes_to_blocks(sb, off + len); + + /* Space is already allocated in ocfs2_global_read_dquot() */ err = ocfs2_simple_size_update(gqinode, oinfo->dqi_gqi_bh, - off + len); + rounded_end); if (err < 0) goto out; new = 1; @@ -234,7 +244,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, } if (err) { mlog_errno(err); - return err; + goto out; } lock_buffer(bh); if (new) @@ -342,7 +352,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type) info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); - oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms); oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); @@ -352,7 +361,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type) oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - oinfo->dqi_syncjiff); + msecs_to_jiffies(oinfo->dqi_syncms)); out_err: mlog_exit(status); @@ -402,13 +411,36 @@ int ocfs2_global_write_info(struct super_block *sb, int type) return err; } +static int ocfs2_global_qinit_alloc(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + + /* + * We may need to allocate tree blocks and a leaf block but not the + * root block + */ + return oinfo->dqi_gi.dqi_qtree_depth; +} + +static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) +{ + /* We modify all the allocated blocks, tree root, and info block */ + return (ocfs2_global_qinit_alloc(sb, type) + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; +} + /* Read in information from global quota file and acquire a reference to it. * dquot_acquire() has already started the transaction and locked quota file */ int ocfs2_global_read_dquot(struct dquot *dquot) { int err, err2, ex = 0; - struct ocfs2_mem_dqinfo *info = - sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct inode *gqinode = info->dqi_gqinode; + int need_alloc = ocfs2_global_qinit_alloc(sb, type); + handle_t *handle = NULL; err = ocfs2_qinfo_lock(info, 0); if (err < 0) @@ -419,14 +451,33 @@ int ocfs2_global_read_dquot(struct dquot *dquot) OCFS2_DQUOT(dquot)->dq_use_count++; OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + ocfs2_qinfo_unlock(info, 0); + if (!dquot->dq_off) { /* No real quota entry? */ - /* Upgrade to exclusive lock for allocation */ - ocfs2_qinfo_unlock(info, 0); - err = ocfs2_qinfo_lock(info, 1); - if (err < 0) - goto out_qlock; ex = 1; + /* + * Add blocks to quota file before we start a transaction since + * locking allocators ranks above a transaction start + */ + WARN_ON(journal_current_handle()); + down_write(&OCFS2_I(gqinode)->ip_alloc_sem); + err = ocfs2_extend_no_holes(gqinode, + gqinode->i_size + (need_alloc << sb->s_blocksize_bits), + gqinode->i_size); + up_write(&OCFS2_I(gqinode)->ip_alloc_sem); + if (err < 0) + goto out; } + + handle = ocfs2_start_trans(osb, + ocfs2_calc_global_qinit_credits(sb, type)); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out; + } + err = ocfs2_qinfo_lock(info, ex); + if (err < 0) + goto out_trans; err = qtree_write_dquot(&info->dqi_gi, dquot); if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) { err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type); @@ -438,6 +489,9 @@ out_qlock: ocfs2_qinfo_unlock(info, 1); else ocfs2_qinfo_unlock(info, 0); +out_trans: + if (handle) + ocfs2_commit_trans(osb, handle); out: if (err < 0) mlog_errno(err); @@ -607,7 +661,7 @@ static void qsync_work_fn(struct work_struct *work) dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - oinfo->dqi_syncjiff); + msecs_to_jiffies(oinfo->dqi_syncms)); } /* @@ -635,20 +689,18 @@ out: return status; } -int ocfs2_calc_qdel_credits(struct super_block *sb, int type) +static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) { - struct ocfs2_mem_dqinfo *oinfo; - int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; - - if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) - return 0; - - oinfo = sb_dqinfo(sb, type)->dqi_priv; - /* We modify tree, leaf block, global info, local chunk header, - * global and local inode */ - return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 + - 2 * OCFS2_INODE_UPDATE_CREDITS; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + /* + * We modify tree, leaf block, global info, local chunk header, + * global and local inode; OCFS2_QINFO_WRITE_CREDITS already + * accounts for inode update + */ + return (oinfo->dqi_gi.dqi_qtree_depth + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + + OCFS2_QINFO_WRITE_CREDITS + + OCFS2_INODE_UPDATE_CREDITS; } static int ocfs2_release_dquot(struct dquot *dquot) @@ -680,33 +732,10 @@ out: return status; } -int ocfs2_calc_qinit_credits(struct super_block *sb, int type) -{ - struct ocfs2_mem_dqinfo *oinfo; - int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; - struct ocfs2_dinode *lfe, *gfe; - - if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) - return 0; - - oinfo = sb_dqinfo(sb, type)->dqi_priv; - gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data; - lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data; - /* We can extend local file + global file. In local file we - * can modify info, chunk header block and dquot block. In - * global file we can modify info, tree and leaf block */ - return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) + - ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) + - 3 + oinfo->dqi_gi.dqi_qtree_depth + 2; -} - static int ocfs2_acquire_dquot(struct dquot *dquot) { - handle_t *handle; struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; - struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); int status = 0; mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); @@ -715,16 +744,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) goto out; - handle = ocfs2_start_trans(osb, - ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type)); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_ilock; - } status = dquot_acquire(dquot); - ocfs2_commit_trans(osb, handle); -out_ilock: ocfs2_unlock_global_qf(oinfo, 1); out: mlog_exit(status); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 5a460fa82553..bdb09cb6e1fe 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -20,6 +20,7 @@ #include "sysfile.h" #include "dlmglue.h" #include "quota.h" +#include "uptodate.h" /* Number of local quota structures per block */ static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) @@ -100,7 +101,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, handle_t *handle; int status; - handle = ocfs2_start_trans(OCFS2_SB(sb), 1); + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -610,7 +612,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, goto out_bh; /* Mark quota file as clean if we are recovering quota file of * some other node. */ - handle = ocfs2_start_trans(osb, 1); + handle = ocfs2_start_trans(osb, + OCFS2_LOCAL_QINFO_WRITE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -940,7 +943,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( struct ocfs2_local_disk_chunk *dchunk; int status; handle_t *handle; - struct buffer_head *bh = NULL; + struct buffer_head *bh = NULL, *dbh = NULL; u64 p_blkno; /* We are protected by dqio_sem so no locking needed */ @@ -964,32 +967,35 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( mlog_errno(status); goto out; } + /* Local quota info and two new blocks we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + /* Initialize chunk header */ down_read(&OCFS2_I(lqinode)->ip_alloc_sem); status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, &p_blkno, NULL, NULL); up_read(&OCFS2_I(lqinode)->ip_alloc_sem); if (status < 0) { mlog_errno(status); - goto out; + goto out_trans; } bh = sb_getblk(sb, p_blkno); if (!bh) { status = -ENOMEM; mlog_errno(status); - goto out; + goto out_trans; } dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; - - handle = ocfs2_start_trans(OCFS2_SB(sb), 2); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out; - } - + ocfs2_set_new_buffer_uptodate(lqinode, bh); status = ocfs2_journal_access_dq(handle, lqinode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto out_trans; @@ -999,7 +1005,6 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( memset(dchunk->dqc_bitmap, 0, sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - OCFS2_QBLK_RESERVED_SPACE); - set_buffer_uptodate(bh); unlock_buffer(bh); status = ocfs2_journal_dirty(handle, bh); if (status < 0) { @@ -1007,6 +1012,38 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( goto out_trans; } + /* Initialize new block with structures */ + down_read(&OCFS2_I(lqinode)->ip_alloc_sem); + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, + &p_blkno, NULL, NULL); + up_read(&OCFS2_I(lqinode)->ip_alloc_sem); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + dbh = sb_getblk(sb, p_blkno); + if (!dbh) { + status = -ENOMEM; + mlog_errno(status); + goto out_trans; + } + ocfs2_set_new_buffer_uptodate(lqinode, dbh); + status = ocfs2_journal_access_dq(handle, lqinode, dbh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(dbh); + memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); + unlock_buffer(dbh); + status = ocfs2_journal_dirty(handle, dbh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + /* Update local quotafile info */ oinfo->dqi_blocks += 2; oinfo->dqi_chunks++; status = ocfs2_local_write_info(sb, type); @@ -1031,6 +1068,7 @@ out_trans: ocfs2_commit_trans(OCFS2_SB(sb), handle); out: brelse(bh); + brelse(dbh); kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); return ERR_PTR(status); } @@ -1048,6 +1086,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( struct ocfs2_local_disk_chunk *dchunk; int epb = ol_quota_entries_per_block(sb); unsigned int chunk_blocks; + struct buffer_head *bh; + u64 p_blkno; int status; handle_t *handle; @@ -1075,12 +1115,49 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( mlog_errno(status); goto out; } - handle = ocfs2_start_trans(OCFS2_SB(sb), 2); + + /* Get buffer from the just added block */ + down_read(&OCFS2_I(lqinode)->ip_alloc_sem); + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + up_read(&OCFS2_I(lqinode)->ip_alloc_sem); + if (status < 0) { + mlog_errno(status); + goto out; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + ocfs2_set_new_buffer_uptodate(lqinode, bh); + + /* Local quota info, chunk header and the new block we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto out; } + /* Zero created block */ + status = ocfs2_journal_access_dq(handle, lqinode, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + unlock_buffer(bh); + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + /* Update chunk header */ status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { @@ -1097,6 +1174,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( mlog_errno(status); goto out_trans; } + /* Update file header */ oinfo->dqi_blocks++; status = ocfs2_local_write_info(sb, type); if (status < 0) { diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 3f661376a2de..e49c41050264 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -17,6 +17,7 @@ * General Public License for more details. */ +#include <linux/kernel.h> #include <linux/crc32.h> #include <linux/module.h> @@ -153,7 +154,7 @@ static int status_map[] = { static int dlm_status_to_errno(enum dlm_status status) { - BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0]))); + BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map)); return status_map[status]; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 7efb349fb9bd..a3f8871d21fd 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -777,6 +777,7 @@ static int ocfs2_sb_probe(struct super_block *sb, } di = (struct ocfs2_dinode *) (*bh)->b_data; memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); + spin_lock_init(&stats->b_lock); status = ocfs2_verify_volume(di, *bh, blksize, stats); if (status >= 0) goto bail; @@ -1182,7 +1183,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) wake_up(&osb->osb_mount_event); /* Start this when the mount is almost sure of being successful */ - ocfs2_orphan_scan_init(osb); + ocfs2_orphan_scan_start(osb); mlog_exit(status); return status; @@ -1213,14 +1214,31 @@ static int ocfs2_get_sb(struct file_system_type *fs_type, mnt); } +static void ocfs2_kill_sb(struct super_block *sb) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + + /* Failed mount? */ + if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) + goto out; + + /* Prevent further queueing of inode drop events */ + spin_lock(&dentry_list_lock); + ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); + spin_unlock(&dentry_list_lock); + /* Wait for work to finish and/or remove it */ + cancel_work_sync(&osb->dentry_lock_work); +out: + kill_block_super(sb); +} + static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", .get_sb = ocfs2_get_sb, /* is this called when we mount * the fs? */ - .kill_sb = kill_block_super, /* set to the generic one - * right now, but do we - * need to change that? */ + .kill_sb = ocfs2_kill_sb, + .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; @@ -1819,6 +1837,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) debugfs_remove(osb->osb_ctxt); + /* + * Flush inode dropping work queue so that deletes are + * performed while the filesystem is still working + */ + ocfs2_drop_all_dl_inodes(osb); + /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); @@ -1981,6 +2005,8 @@ static int ocfs2_initialize_super(struct super_block *sb, snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + ocfs2_orphan_scan_init(osb); + status = ocfs2_recovery_init(osb); if (status) { mlog(ML_ERROR, "Unable to initialize recovery state\n"); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index ba320e250747..d1a27cda984f 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1052,7 +1052,8 @@ static int ocfs2_xattr_block_get(struct inode *inode, struct ocfs2_xattr_block *xb; struct ocfs2_xattr_value_root *xv; size_t size; - int ret = -ENODATA, name_offset, name_len, block_off, i; + int ret = -ENODATA, name_offset, name_len, i; + int uninitialized_var(block_off); xs->bucket = ocfs2_xattr_bucket_new(inode); if (!xs->bucket) { diff --git a/fs/open.c b/fs/open.c index dd98e8076024..31191bf513e4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -199,7 +199,7 @@ out: int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { - int err; + int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ @@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, } /* Remove suid/sgid on truncate too */ - newattrs.ia_valid |= should_remove_suid(dentry); + ret = should_remove_suid(dentry); + if (ret) + newattrs.ia_valid |= ret | ATTR_FORCE; mutex_lock(&dentry->d_inode->i_mutex); - err = notify_change(dentry, &newattrs); + ret = notify_change(dentry, &newattrs); mutex_unlock(&dentry->d_inode->i_mutex); - return err; + return ret; } static long do_sys_truncate(const char __user *pathname, loff_t length) @@ -957,6 +959,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, int error; struct file *f; + validate_creds(cred); + /* * We must always pass in a valid mount pointer. Historically * callers got away with not passing it, but we must enforce this at diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 1a9c7878f864..fbeaddf595d3 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - p->in_flight, + part_in_flight(p), jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } +ssize_t part_inflight_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); +} + #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = { &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_stat.attr, + &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -302,7 +312,7 @@ static struct attribute_group part_attr_group = { .attrs = part_attrs, }; -static struct attribute_group *part_attr_groups[] = { +static const struct attribute_group *part_attr_groups[] = { &part_attr_group, #ifdef CONFIG_BLK_DEV_IO_TRACE &blk_trace_attr_group, @@ -436,7 +446,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, rcu_assign_pointer(ptbl->part[partno], p); /* suppress uevent if the disk supresses it */ - if (!dev_get_uevent_suppress(pdev)) + if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); return p; diff --git a/fs/pipe.c b/fs/pipe.c index f7dd21ad85a6..52c415114838 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -68,8 +68,8 @@ void pipe_double_lock(struct pipe_inode_info *pipe1, pipe_lock_nested(pipe1, I_MUTEX_PARENT); pipe_lock_nested(pipe2, I_MUTEX_CHILD); } else { - pipe_lock_nested(pipe2, I_MUTEX_CHILD); - pipe_lock_nested(pipe1, I_MUTEX_PARENT); + pipe_lock_nested(pipe2, I_MUTEX_PARENT); + pipe_lock_nested(pipe1, I_MUTEX_CHILD); } } diff --git a/fs/proc/base.c b/fs/proc/base.c index 3ce5ae9e3d2d..6f742f6658a9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -234,23 +234,20 @@ static int check_mem_permission(struct task_struct *task) struct mm_struct *mm_for_maps(struct task_struct *task) { - struct mm_struct *mm = get_task_mm(task); - if (!mm) + struct mm_struct *mm; + + if (mutex_lock_killable(&task->cred_guard_mutex)) return NULL; - down_read(&mm->mmap_sem); - task_lock(task); - if (task->mm != mm) - goto out; - if (task->mm != current->mm && - __ptrace_may_access(task, PTRACE_MODE_READ) < 0) - goto out; - task_unlock(task); + + mm = get_task_mm(task); + if (mm && mm != current->mm && + !ptrace_may_access(task, PTRACE_MODE_READ)) { + mmput(mm); + mm = NULL; + } + mutex_unlock(&task->cred_guard_mutex); + return mm; -out: - task_unlock(task); - up_read(&mm->mmap_sem); - mmput(mm); - return NULL; } static int proc_pid_cmdline(struct task_struct *task, char * buffer) @@ -1006,12 +1003,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, if (!task) return -ESRCH; - task_lock(task); - if (task->mm) - oom_adjust = task->mm->oom_adj; - else - oom_adjust = OOM_DISABLE; - task_unlock(task); + oom_adjust = task->oomkilladj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1040,19 +1032,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - task_lock(task); - if (!task->mm) { - task_unlock(task); - put_task_struct(task); - return -EINVAL; - } - if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { - task_unlock(task); + if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { put_task_struct(task); return -EACCES; } - task->mm->oom_adj = oom_adjust; - task_unlock(task); + task->oomkilladj = oom_adjust; put_task_struct(task); if (end - buffer == 0) return -EIO; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6f61b7cc32e0..9bd8be1d235c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) mm = mm_for_maps(priv->task); if (!mm) return NULL; + down_read(&mm->mmap_sem); tail_vma = get_gate_vma(priv->task); priv->tail_vma = tail_vma; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 64a72e2e7650..8f5c05d3dbd3 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) priv->task = NULL; return NULL; } + down_read(&mm->mmap_sem); /* start from the Nth VMA */ for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 607c579e5eca..38f7bd559f35 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2042,7 +2042,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, * changes */ invalidate_bdev(sb->s_bdev); } - mutex_lock(&inode->i_mutex); mutex_lock(&dqopt->dqonoff_mutex); if (sb_has_quota_loaded(sb, type)) { error = -EBUSY; @@ -2054,9 +2053,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ down_write(&dqopt->dqptr_sem); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; + mutex_unlock(&inode->i_mutex); up_write(&dqopt->dqptr_sem); sb->dq_op->drop(inode); } @@ -2080,7 +2081,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, goto out_file_init; } mutex_unlock(&dqopt->dqio_mutex); - mutex_unlock(&inode->i_mutex); spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); @@ -2094,16 +2094,17 @@ out_file_init: dqopt->files[type] = NULL; iput(inode); out_lock: - mutex_unlock(&dqopt->dqonoff_mutex); if (oldflags != -1) { down_write(&dqopt->dqptr_sem); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); inode->i_flags |= oldflags; + mutex_unlock(&inode->i_mutex); up_write(&dqopt->dqptr_sem); } - mutex_unlock(&inode->i_mutex); + mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: put_quota_format(fmt); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index ebb2c417912c..11f0c06316de 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -20,6 +20,7 @@ #include <linux/ramfs.h> #include <linux/pagevec.h> #include <linux/mman.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include "internal.h" diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 0ff7566c767c..a7f0110fca4c 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -46,6 +46,7 @@ static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; static struct backing_dev_info ramfs_backing_dev_info = { + .name = "ramfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 77f5bb746bf0..90622200b39c 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -997,7 +997,7 @@ static int reiserfs_async_progress_wait(struct super_block *s) DEFINE_WAIT(wait); struct reiserfs_journal *j = SB_JOURNAL(s); if (atomic_read(&j->j_async_throttle)) - congestion_wait(WRITE, HZ / 10); + congestion_wait(BLK_RW_ASYNC, HZ / 10); return 0; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index d3aeb061612b..7adea74d6a8a 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -24,7 +24,6 @@ #include <linux/exportfs.h> #include <linux/quotaops.h> #include <linux/vfs.h> -#include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/crc32.h> diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index f3d47d856848..6925b835a43b 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -46,7 +46,6 @@ #include <linux/reiserfs_acl.h> #include <asm/uaccess.h> #include <net/checksum.h> -#include <linux/smp_lock.h> #include <linux/stat.h> #include <linux/quotaops.h> diff --git a/fs/select.c b/fs/select.c index d870237e42c7..8084834e123e 100644 --- a/fs/select.c +++ b/fs/select.c @@ -110,6 +110,7 @@ void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); pwq->polling_task = current; + pwq->triggered = 0; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; diff --git a/fs/splice.c b/fs/splice.c index 73766d24f97b..7394e9e17534 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, len = left; ret = __generic_file_splice_read(in, ppos, pipe, len, flags); - if (ret > 0) + if (ret > 0) { *ppos += ret; + file_accessed(in); + } return ret; } @@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); ret = file_remove_suid(out); - if (!ret) + if (!ret) { + file_update_time(out); ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); + } mutex_unlock(&inode->i_mutex); } while (ret > 0); splice_from_pipe_end(pipe, &sd); @@ -976,25 +980,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, if (ret > 0) { unsigned long nr_pages; + int err; - *ppos += ret; nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; - - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); - - if (err) - ret = err; - } + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 3b52770f46ff..cb5fc57e370b 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -30,6 +30,7 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> +#include <linux/smp_lock.h> #include <linux/mutex.h> #include <linux/pagemap.h> #include <linux/init.h> diff --git a/fs/super.c b/fs/super.c index 2761d3e22ed9..b03fea8fbfb6 100644 --- a/fs/super.c +++ b/fs/super.c @@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type) s = NULL; goto out; } - INIT_LIST_HEAD(&s->s_dirty); - INIT_LIST_HEAD(&s->s_io); - INIT_LIST_HEAD(&s->s_more_io); INIT_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); @@ -171,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb) * Drops a temporary reference, frees superblock if there's no * references left. */ -static void put_super(struct super_block *sb) +void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); @@ -710,6 +707,12 @@ static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; + + /* + * We set the bdi here to the queue backing, file systems can + * overwrite this in ->fill_super() + */ + s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; return 0; } diff --git a/fs/sync.c b/fs/sync.c index dd200025af85..c08467a5d7cb 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -19,20 +19,29 @@ SYNC_FILE_RANGE_WAIT_AFTER) /* - * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) - * just dirties buffers with inodes so we have to submit IO for these buffers - * via __sync_blockdev(). This also speeds up the wait == 1 case since in that - * case write_inode() functions do sync_dirty_buffer() and thus effectively - * write one block at a time. + * Do the filesystem syncing work. For simple filesystems + * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to + * submit IO for these buffers via __sync_blockdev(). This also speeds up the + * wait == 1 case since in that case write_inode() functions do + * sync_dirty_buffer() and thus effectively write one block at a time. */ static int __sync_filesystem(struct super_block *sb, int wait) { + /* + * This should be safe, as we require bdi backing to actually + * write out data in the first place + */ + if (!sb->s_bdi) + return 0; + /* Avoid doing twice syncing and cache pruning for quota sync */ - if (!wait) + if (!wait) { writeout_quota_sb(sb, -1); - else + writeback_inodes_sb(sb); + } else { sync_quota_sb(sb, -1); - sync_inodes_sb(sb, wait); + sync_inodes_sb(sb); + } if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); return __sync_blockdev(sb->s_bdev, wait); @@ -99,7 +108,7 @@ restart: spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (!(sb->s_flags & MS_RDONLY) && sb->s_root) + if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi) __sync_filesystem(sb, wait); up_read(&sb->s_umount); @@ -112,8 +121,13 @@ restart: mutex_unlock(&mutex); } +/* + * sync everything. Start out by waking pdflush, because that writes back + * all queues in parallel. + */ SYSCALL_DEFINE0(sync) { + wakeup_flusher_threads(0); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) @@ -171,19 +185,23 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) } /** - * vfs_fsync - perform a fsync or fdatasync on a file + * vfs_fsync_range - helper to sync a range of data & metadata to disk * @file: file to sync * @dentry: dentry of @file - * @data: only perform a fdatasync operation + * @start: offset in bytes of the beginning of data range to sync + * @end: offset in bytes of the end of data range (inclusive) + * @datasync: perform only datasync * - * Write back data and metadata for @file to disk. If @datasync is - * set only metadata needed to access modified file data is written. + * Write back data in range @start..@end and metadata for @file to disk. If + * @datasync is set only metadata needed to access modified file data is + * written. * * In case this function is called from nfsd @file may be %NULL and * only @dentry is set. This can only happen when the filesystem * implements the export_operations API. */ -int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, + loff_t end, int datasync) { const struct file_operations *fop; struct address_space *mapping; @@ -207,7 +225,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) goto out; } - ret = filemap_fdatawrite(mapping); + ret = filemap_write_and_wait_range(mapping, start, end); /* * We need to protect against concurrent writers, which could cause @@ -218,12 +236,29 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); - err = filemap_fdatawait(mapping); - if (!ret) - ret = err; + out: return ret; } +EXPORT_SYMBOL(vfs_fsync_range); + +/** + * vfs_fsync - perform a fsync or fdatasync on a file + * @file: file to sync + * @dentry: dentry of @file + * @datasync: only perform a fdatasync operation + * + * Write back data and metadata for @file to disk. If @datasync is + * set only metadata needed to access modified file data is written. + * + * In case this function is called from nfsd @file may be %NULL and + * only @dentry is set. This can only happen when the filesystem + * implements the export_operations API. + */ +int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); +} EXPORT_SYMBOL(vfs_fsync); static int do_fsync(unsigned int fd, int datasync) @@ -249,6 +284,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) return do_fsync(fd, 1); } +/** + * generic_write_sync - perform syncing after a write if file / inode is sync + * @file: file to which the write happened + * @pos: offset where the write started + * @count: length of the write + * + * This is just a simple wrapper about our general syncing function. + */ +int generic_write_sync(struct file *file, loff_t pos, loff_t count) +{ + if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host)) + return 0; + return vfs_fsync_range(file, file->f_path.dentry, pos, + pos + count - 1, 1); +} +EXPORT_SYMBOL(generic_write_sync); + /* * sys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 9345806c8853..2524714bece1 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -171,6 +171,7 @@ static ssize_t write(struct file *file, const char __user *userbuf, if (count > 0) *off = offs + count; + kfree(temp); return count; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index d88d0fac9fa5..0050fc40e8c9 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -760,6 +760,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, const struct inode_operations sysfs_dir_inode_operations = { .lookup = sysfs_lookup, .setattr = sysfs_setattr, + .setxattr = sysfs_setxattr, }; static void remove_dir(struct sysfs_dirent *sd) @@ -939,8 +940,10 @@ again: /* Remove from old parent's list and insert into new parent's list. */ sysfs_unlink_sibling(sd); sysfs_get(new_parent_sd); + drop_nlink(old_parent->d_inode); sysfs_put(sd->s_parent); sd->s_parent = new_parent_sd; + inc_nlink(new_parent->d_inode); sysfs_link_sibling(sd); out_unlock: diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 555f0ff988df..e28cecf179f5 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -18,6 +18,8 @@ #include <linux/capability.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/xattr.h> +#include <linux/security.h> #include "sysfs.h" extern struct super_block * sysfs_sb; @@ -29,12 +31,14 @@ static const struct address_space_operations sysfs_aops = { }; static struct backing_dev_info sysfs_backing_dev_info = { + .name = "sysfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; static const struct inode_operations sysfs_inode_operations ={ .setattr = sysfs_setattr, + .setxattr = sysfs_setxattr, }; int __init sysfs_inode_init(void) @@ -42,18 +46,37 @@ int __init sysfs_inode_init(void) return bdi_init(&sysfs_backing_dev_info); } +struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) +{ + struct sysfs_inode_attrs *attrs; + struct iattr *iattrs; + + attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); + if (!attrs) + return NULL; + iattrs = &attrs->ia_iattr; + + /* assign default attributes */ + iattrs->ia_mode = sd->s_mode; + iattrs->ia_uid = 0; + iattrs->ia_gid = 0; + iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + return attrs; +} int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) { struct inode * inode = dentry->d_inode; struct sysfs_dirent * sd = dentry->d_fsdata; - struct iattr * sd_iattr; + struct sysfs_inode_attrs *sd_attrs; + struct iattr *iattrs; unsigned int ia_valid = iattr->ia_valid; int error; if (!sd) return -EINVAL; - sd_iattr = sd->s_iattr; + sd_attrs = sd->s_iattr; error = inode_change_ok(inode, iattr); if (error) @@ -65,42 +88,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) if (error) return error; - if (!sd_iattr) { + if (!sd_attrs) { /* setting attributes for the first time, allocate now */ - sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); - if (!sd_iattr) + sd_attrs = sysfs_init_inode_attrs(sd); + if (!sd_attrs) return -ENOMEM; - /* assign default attributes */ - sd_iattr->ia_mode = sd->s_mode; - sd_iattr->ia_uid = 0; - sd_iattr->ia_gid = 0; - sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; - sd->s_iattr = sd_iattr; + sd->s_iattr = sd_attrs; + } else { + /* attributes were changed at least once in past */ + iattrs = &sd_attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + iattrs->ia_mode = sd->s_mode = mode; + } } + return error; +} - /* attributes were changed atleast once in past */ - - if (ia_valid & ATTR_UID) - sd_iattr->ia_uid = iattr->ia_uid; - if (ia_valid & ATTR_GID) - sd_iattr->ia_gid = iattr->ia_gid; - if (ia_valid & ATTR_ATIME) - sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MTIME) - sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_CTIME) - sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - sd_iattr->ia_mode = sd->s_mode = mode; - } +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_inode_attrs *iattrs; + void *secdata; + int error; + u32 secdata_len = 0; + + if (!sd) + return -EINVAL; + if (!sd->s_iattr) + sd->s_iattr = sysfs_init_inode_attrs(sd); + if (!sd->s_iattr) + return -ENOMEM; + + iattrs = sd->s_iattr; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(dentry->d_inode, suffix, + value, size, flags); + if (error) + goto out; + error = security_inode_getsecctx(dentry->d_inode, + &secdata, &secdata_len); + if (error) + goto out; + if (iattrs->ia_secdata) + security_release_secctx(iattrs->ia_secdata, + iattrs->ia_secdata_len); + iattrs->ia_secdata = secdata; + iattrs->ia_secdata_len = secdata_len; + } else + return -EINVAL; +out: return error; } @@ -146,6 +204,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd) static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) { struct bin_attribute *bin_attr; + struct sysfs_inode_attrs *iattrs; inode->i_private = sysfs_get(sd); inode->i_mapping->a_ops = &sysfs_aops; @@ -154,16 +213,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) inode->i_ino = sd->s_ino; lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); - if (sd->s_iattr) { + iattrs = sd->s_iattr; + if (iattrs) { /* sysfs_dirent has non-default attributes * get them for the new inode from persistent copy * in sysfs_dirent */ - set_inode_attr(inode, sd->s_iattr); + set_inode_attr(inode, &iattrs->ia_iattr); + if (iattrs->ia_secdata) + security_inode_notifysecctx(inode, + iattrs->ia_secdata, + iattrs->ia_secdata_len); } else set_default_inode_attr(inode, sd->s_mode); - /* initialize inode according to type */ switch (sysfs_type(sd)) { case SYSFS_DIR: diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 1d897ad808e0..c5081ad77026 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -16,6 +16,7 @@ #include <linux/kobject.h> #include <linux/namei.h> #include <linux/mutex.h> +#include <linux/security.h> #include "sysfs.h" @@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co } const struct inode_operations sysfs_symlink_inode_operations = { + .setxattr = sysfs_setxattr, .readlink = generic_readlink, .follow_link = sysfs_follow_link, .put_link = sysfs_put_link, diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3fa0d98481e2..af4c4e7482ac 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,6 +8,8 @@ * This file is released under the GPLv2. */ +#include <linux/fs.h> + struct sysfs_open_dirent; /* type-specific structures for sysfs_dirent->s_* union members */ @@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr { struct hlist_head buffers; }; +struct sysfs_inode_attrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; +}; + /* * sysfs_dirent - the building block of sysfs hierarchy. Each and * every sysfs node is represented by single sysfs_dirent. @@ -56,7 +64,7 @@ struct sysfs_dirent { unsigned int s_flags; ino_t s_ino; umode_t s_mode; - struct iattr *s_iattr; + struct sysfs_inode_attrs *s_iattr; }; #define SD_DEACTIVATED_BIAS INT_MIN @@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) struct inode *sysfs_get_inode(struct sysfs_dirent *sd); void sysfs_delete_inode(struct inode *inode); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); int sysfs_inode_init(void); diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index eaf6d891d46f..ee1ce68fd98b 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -54,41 +54,15 @@ * @nr_to_write: how many dirty pages to write-back * * This function shrinks UBIFS liability by means of writing back some amount - * of dirty inodes and their pages. Returns the amount of pages which were - * written back. The returned value does not include dirty inodes which were - * synchronized. + * of dirty inodes and their pages. * * Note, this function synchronizes even VFS inodes which are locked * (@i_mutex) by the caller of the budgeting function, because write-back does * not touch @i_mutex. */ -static int shrink_liability(struct ubifs_info *c, int nr_to_write) +static void shrink_liability(struct ubifs_info *c, int nr_to_write) { - int nr_written; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .range_end = LLONG_MAX, - .nr_to_write = nr_to_write, - }; - - generic_sync_sb_inodes(c->vfs_sb, &wbc); - nr_written = nr_to_write - wbc.nr_to_write; - - if (!nr_written) { - /* - * Re-try again but wait on pages/inodes which are being - * written-back concurrently (e.g., by pdflush). - */ - memset(&wbc, 0, sizeof(struct writeback_control)); - wbc.sync_mode = WB_SYNC_ALL; - wbc.range_end = LLONG_MAX; - wbc.nr_to_write = nr_to_write; - generic_sync_sb_inodes(c->vfs_sb, &wbc); - nr_written = nr_to_write - wbc.nr_to_write; - } - - dbg_budg("%d pages were written back", nr_written); - return nr_written; + writeback_inodes_sb(c->vfs_sb); } /** diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index bc5857199ec2..762a7d6cec73 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -297,6 +297,7 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer) { struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer); + dbg_io("jhead %d", wbuf->jhead); wbuf->need_sync = 1; wbuf->c->need_wbuf_sync = 1; ubifs_wake_up_bgt(wbuf->c); @@ -311,8 +312,12 @@ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) { ubifs_assert(!hrtimer_active(&wbuf->timer)); - if (!ktime_to_ns(wbuf->softlimit)) + if (wbuf->no_timer) return; + dbg_io("set timer for jhead %d, %llu-%llu millisecs", wbuf->jhead, + div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC), + div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta, + USEC_PER_SEC)); hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta, HRTIMER_MODE_REL); } @@ -323,11 +328,8 @@ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) */ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) { - /* - * If the syncer is waiting for the lock (from the background thread's - * context) and another task is changing write-buffer then the syncing - * should be canceled. - */ + if (wbuf->no_timer) + return; wbuf->need_sync = 0; hrtimer_cancel(&wbuf->timer); } @@ -349,8 +351,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) /* Write-buffer is empty or not seeked */ return 0; - dbg_io("LEB %d:%d, %d bytes", - wbuf->lnum, wbuf->offs, wbuf->used); + dbg_io("LEB %d:%d, %d bytes, jhead %d", + wbuf->lnum, wbuf->offs, wbuf->used, wbuf->jhead); ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); ubifs_assert(!(wbuf->avail & 7)); ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); @@ -390,7 +392,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) * @offs: logical eraseblock offset to seek to * @dtype: data type * - * This function targets the write buffer to logical eraseblock @lnum:@offs. + * This function targets the write-buffer to logical eraseblock @lnum:@offs. * The write-buffer is synchronized if it is not empty. Returns zero in case of * success and a negative error code in case of failure. */ @@ -399,7 +401,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, { const struct ubifs_info *c = wbuf->c; - dbg_io("LEB %d:%d", lnum, offs); + dbg_io("LEB %d:%d, jhead %d", lnum, offs, wbuf->jhead); ubifs_assert(lnum >= 0 && lnum < c->leb_cnt); ubifs_assert(offs >= 0 && offs <= c->leb_size); ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); @@ -506,9 +508,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) struct ubifs_info *c = wbuf->c; int err, written, n, aligned_len = ALIGN(len, 8), offs; - dbg_io("%d bytes (%s) to wbuf at LEB %d:%d", len, - dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->lnum, - wbuf->offs + wbuf->used); + dbg_io("%d bytes (%s) to jhead %d wbuf at LEB %d:%d", len, + dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->jhead, + wbuf->lnum, wbuf->offs + wbuf->used); ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); @@ -533,8 +535,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) memcpy(wbuf->buf + wbuf->used, buf, len); if (aligned_len == wbuf->avail) { - dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, - wbuf->offs); + dbg_io("flush jhead %d wbuf to LEB %d:%d", + wbuf->jhead, wbuf->lnum, wbuf->offs); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, c->min_io_size, wbuf->dtype); @@ -562,7 +564,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) * minimal I/O unit. We have to fill and flush write-buffer and switch * to the next min. I/O unit. */ - dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, wbuf->offs); + dbg_io("flush jhead %d wbuf to LEB %d:%d", + wbuf->jhead, wbuf->lnum, wbuf->offs); memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, c->min_io_size, wbuf->dtype); @@ -695,7 +698,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, int err, rlen, overlap; struct ubifs_ch *ch = buf; - dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + dbg_io("LEB %d:%d, %s, length %d, jhead %d", lnum, offs, + dbg_ntype(type), len, wbuf->jhead); ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); ubifs_assert(!(offs & 7) && offs < c->leb_size); ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); @@ -819,13 +823,12 @@ out: * @c: UBIFS file-system description object * @wbuf: write-buffer to initialize * - * This function initializes write buffer. Returns zero in case of success + * This function initializes write-buffer. Returns zero in case of success * %-ENOMEM in case of failure. */ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) { size_t size; - ktime_t hardlimit; wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); if (!wbuf->buf) @@ -851,22 +854,16 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); wbuf->timer.function = wbuf_timer_callback_nolock; - /* - * Make write-buffer soft limit to be 20% of the hard limit. The - * write-buffer timer is allowed to expire any time between the soft - * and hard limits. - */ - hardlimit = ktime_set(DEFAULT_WBUF_TIMEOUT_SECS, 0); - wbuf->delta = (DEFAULT_WBUF_TIMEOUT_SECS * NSEC_PER_SEC) * 2 / 10; - wbuf->softlimit = ktime_sub_ns(hardlimit, wbuf->delta); - hrtimer_set_expires_range_ns(&wbuf->timer, wbuf->softlimit, - wbuf->delta); + wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0); + wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT; + wbuf->delta *= 1000000000ULL; + ubifs_assert(wbuf->delta <= ULONG_MAX); return 0; } /** * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array. - * @wbuf: the write-buffer whereto add + * @wbuf: the write-buffer where to add * @inum: the inode number * * This function adds an inode number to the inode array of the write-buffer. diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 6db7a6be6c97..8aacd64957a2 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -25,7 +25,6 @@ /* This file implements EXT2-compatible extended attribute ioctl() calls */ #include <linux/compat.h> -#include <linux/smp_lock.h> #include <linux/mount.h> #include "ubifs.h" diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 805605250f12..e5f6cf8a1155 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -53,6 +53,25 @@ static int is_empty(void *buf, int len) } /** + * first_non_ff - find offset of the first non-0xff byte. + * @buf: buffer to search in + * @len: length of buffer + * + * This function returns offset of the first non-0xff byte in @buf or %-1 if + * the buffer contains only 0xff bytes. + */ +static int first_non_ff(void *buf, int len) +{ + uint8_t *p = buf; + int i; + + for (i = 0; i < len; i++) + if (*p++ != 0xff) + return i; + return -1; +} + +/** * get_master_node - get the last valid master node allowing for corruption. * @c: UBIFS file-system description object * @lnum: LEB number @@ -357,11 +376,7 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) empty_offs = ALIGN(offs + 1, c->min_io_size); check_len = c->leb_size - empty_offs; p = buf + empty_offs - offs; - - for (; check_len > 0; check_len--) - if (*p++ != 0xff) - return 0; - return 1; + return is_empty(p, check_len); } /** @@ -543,8 +558,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) * * This function does a scan of a LEB, but caters for errors that might have * been caused by the unclean unmount from which we are attempting to recover. - * - * This function returns %0 on success and a negative error code on failure. + * Returns %0 in case of success, %-EUCLEAN if an unrecoverable corruption is + * found, and a negative error code in case of failure. */ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf, int grouped) @@ -643,7 +658,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, goto corrupted; default: dbg_err("unknown"); - goto corrupted; + err = -EINVAL; + goto error; } } @@ -652,8 +668,13 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, clean_buf(c, &buf, lnum, &offs, &len); need_clean = 1; } else { - ubifs_err("corrupt empty space at LEB %d:%d", - lnum, offs); + int corruption = first_non_ff(buf, len); + + ubifs_err("corrupt empty space LEB %d:%d, corruption " + "starts at %d", lnum, offs, corruption); + /* Make sure we dump interesting non-0xFF data */ + offs = corruption; + buf += corruption; goto corrupted; } } @@ -813,7 +834,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, static int recover_head(const struct ubifs_info *c, int lnum, int offs, void *sbuf) { - int len, err, need_clean = 0; + int len, err; if (c->min_io_size > 1) len = c->min_io_size; @@ -827,19 +848,7 @@ static int recover_head(const struct ubifs_info *c, int lnum, int offs, /* Read at the head location and check it is empty flash */ err = ubi_read(c->ubi, lnum, sbuf, offs, len); - if (err) - need_clean = 1; - else { - uint8_t *p = sbuf; - - while (len--) - if (*p++ != 0xff) { - need_clean = 1; - break; - } - } - - if (need_clean) { + if (err || !is_empty(sbuf, len)) { dbg_rcvry("cleaning head at %d:%d", lnum, offs); if (offs == 0) return ubifs_leb_unmap(c, lnum); diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 11cc80125a49..2970500f32df 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -837,9 +837,10 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) dbg_mnt("replay log LEB %d:%d", lnum, offs); sleb = ubifs_scan(c, lnum, offs, sbuf); - if (IS_ERR(sleb)) { - if (c->need_recovery) - sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); + if (IS_ERR(sleb) ) { + if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) + return PTR_ERR(sleb); + sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); if (IS_ERR(sleb)) return PTR_ERR(sleb); } @@ -957,7 +958,7 @@ out: return err; out_dump: - ubifs_err("log error detected while replying the log at LEB %d:%d", + ubifs_err("log error detected while replaying the log at LEB %d:%d", lnum, offs + snod->offs); dbg_dump_node(c, snod->node); ubifs_scan_destroy(sleb); diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 0ed82479b44b..892ebfee4fe5 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -238,12 +238,12 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, { int len; - ubifs_err("corrupted data at LEB %d:%d", lnum, offs); + ubifs_err("corruption at LEB %d:%d", lnum, offs); if (dbg_failure_mode) return; len = c->leb_size - offs; - if (len > 4096) - len = 4096; + if (len > 8192) + len = 8192; dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); } @@ -256,7 +256,9 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, * @sbuf: scan buffer (must be c->leb_size) * * This function scans LEB number @lnum and returns complete information about - * its contents. Returns an error code in case of failure. + * its contents. Returns the scaned information in case of success and, + * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case + * of failure. */ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, int offs, void *sbuf) @@ -279,7 +281,6 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, cond_resched(); ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); - if (ret > 0) { /* Padding bytes or a valid padding node */ offs += ret; @@ -304,7 +305,8 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, goto corrupted; default: dbg_err("unknown"); - goto corrupted; + err = -EINVAL; + goto error; } err = ubifs_add_snod(c, sleb, buf, offs); @@ -317,8 +319,10 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, len -= node_len; } - if (offs % c->min_io_size) - goto corrupted; + if (offs % c->min_io_size) { + ubifs_err("empty space starts at non-aligned offset %d", offs); + goto corrupted;; + } ubifs_end_scan(c, sleb, lnum, offs); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 79fad43f3c57..c4af069df1ad 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -438,12 +438,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) { int i, err; struct ubifs_info *c = sb->s_fs_info; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .range_start = 0, - .range_end = LLONG_MAX, - .nr_to_write = LONG_MAX, - }; /* * Zero @wait is just an advisory thing to help the file system shove @@ -462,7 +456,7 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) * the user be able to get more accurate results of 'statfs()' after * they synchronize the file system. */ - generic_sync_sb_inodes(sb, &wbc); + sync_inodes_sb(sb); /* * Synchronize write buffers, because 'ubifs_run_commit()' does not @@ -797,7 +791,7 @@ static int alloc_wbufs(struct ubifs_info *c) * does not need to be synchronized by timer. */ c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; - c->jheads[GCHD].wbuf.softlimit = ktime_set(0, 0); + c->jheads[GCHD].wbuf.no_timer = 1; return 0; } @@ -986,7 +980,7 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, switch (token) { /* * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. - * We accepte them in order to be backware-compatible. But this + * We accept them in order to be backward-compatible. But this * should be removed at some point. */ case Opt_fast_unmount: @@ -1287,6 +1281,9 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_journal; + /* Calculate 'min_idx_lebs' after journal replay */ + c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only); if (err) goto out_orphans; @@ -1754,10 +1751,8 @@ static void ubifs_put_super(struct super_block *sb) /* Synchronize write-buffers */ if (c->jheads) - for (i = 0; i < c->jhead_cnt; i++) { + for (i = 0; i < c->jhead_cnt; i++) ubifs_wbuf_sync(&c->jheads[i].wbuf); - hrtimer_cancel(&c->jheads[i].wbuf.timer); - } /* * On fatal errors c->ro_media is set to 1, in which case we do @@ -1970,12 +1965,14 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) * * Read-ahead will be disabled because @c->bdi.ra_pages is 0. */ + c->bdi.name = "ubifs", c->bdi.capabilities = BDI_CAP_MAP_COPY; c->bdi.unplug_io_fn = default_unplug_io_fn; err = bdi_init(&c->bdi); if (err) goto out_close; - err = bdi_register(&c->bdi, NULL, "ubifs"); + err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d", + c->vi.ubi_num, c->vi.vol_id); if (err) goto out_bdi; @@ -1983,6 +1980,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_bdi; + sb->s_bdi = &c->bdi; sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; sb->s_blocksize = UBIFS_BLOCK_SIZE; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 1bf01d820066..a29349094422 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -95,8 +95,9 @@ */ #define BGT_NAME_PATTERN "ubifs_bgt%d_%d" -/* Default write-buffer synchronization timeout in seconds */ -#define DEFAULT_WBUF_TIMEOUT_SECS 5 +/* Write-buffer synchronization timeout interval in seconds */ +#define WBUF_TIMEOUT_SOFTLIMIT 3 +#define WBUF_TIMEOUT_HARDLIMIT 5 /* Maximum possible inode number (only 32-bit inodes are supported now) */ #define MAX_INUM 0xFFFFFFFF @@ -654,7 +655,8 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @delta: hard and soft timeouts delta (the timer expire inteval is @softlimit * and @softlimit + @delta) * @timer: write-buffer timer - * @need_sync: it is set if its timer expired and needs sync + * @no_timer: non-zero if this write-buffer does not have a timer + * @need_sync: non-zero if the timer expired and the wbuf needs sync'ing * @next_ino: points to the next position of the following inode number * @inodes: stores the inode numbers of the nodes which are in wbuf * @@ -683,7 +685,8 @@ struct ubifs_wbuf { ktime_t softlimit; unsigned long long delta; struct hrtimer timer; - int need_sync; + unsigned int no_timer:1; + unsigned int need_sync:1; int next_ino; ino_t *inodes; }; diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 1d2c570704c8..2ffdb6733af1 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -18,59 +18,6 @@ #include <linux/string.h> #include <linux/buffer_head.h> -#if 0 -static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad, - uint8_t ad_size, struct kernel_lb_addr fe_loc, - int *pos, int *offset, struct buffer_head **bh, - int *error) -{ - int loffset = *offset; - int block; - uint8_t *ad; - int remainder; - - *error = 0; - - ad = (uint8_t *)(*bh)->b_data + *offset; - *offset += ad_size; - - if (!ad) { - brelse(*bh); - *error = 1; - return NULL; - } - - if (*offset == dir->i_sb->s_blocksize) { - brelse(*bh); - block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos); - if (!block) - return NULL; - *bh = udf_tread(dir->i_sb, block); - if (!*bh) - return NULL; - } else if (*offset > dir->i_sb->s_blocksize) { - ad = tmpad; - - remainder = dir->i_sb->s_blocksize - loffset; - memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder); - - brelse(*bh); - block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos); - if (!block) - return NULL; - (*bh) = udf_tread(dir->i_sb, block); - if (!*bh) - return NULL; - - memcpy((uint8_t *)ad + remainder, (*bh)->b_data, - ad_size - remainder); - *offset = ad_size - remainder; - } - - return ad; -} -#endif - struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, struct udf_fileident_bh *fibh, struct fileIdentDesc *cfi, @@ -248,39 +195,6 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) return fi; } -#if 0 -static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) -{ - struct extent_ad *ext; - struct fileEntry *fe; - uint8_t *ptr; - - if ((!buffer) || (!offset)) { - printk(KERN_ERR "udf: udf_get_fileextent() invalidparms\n"); - return NULL; - } - - fe = (struct fileEntry *)buffer; - - if (fe->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FE)) { - udf_debug("0x%x != TAG_IDENT_FE\n", - le16_to_cpu(fe->descTag.tagIdent)); - return NULL; - } - - ptr = (uint8_t *)(fe->extendedAttr) + - le32_to_cpu(fe->lengthExtendedAttr); - - if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs))) - ptr += *offset; - - ext = (struct extent_ad *)ptr; - - *offset = *offset + sizeof(struct extent_ad); - return ext; -} -#endif - struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) { diff --git a/fs/udf/file.c b/fs/udf/file.c index 7464305382b5..b80cbd78833c 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -193,9 +193,11 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, static int udf_release_file(struct inode *inode, struct file *filp) { if (filp->f_mode & FMODE_WRITE) { + mutex_lock(&inode->i_mutex); lock_kernel(); udf_discard_prealloc(inode); unlock_kernel(); + mutex_unlock(&inode->i_mutex); } return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e7533f785636..6d24c2c63f93 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -90,19 +90,16 @@ no_delete: } /* - * If we are going to release inode from memory, we discard preallocation and - * truncate last inode extent to proper length. We could use drop_inode() but - * it's called under inode_lock and thus we cannot mark inode dirty there. We - * use clear_inode() but we have to make sure to write inode as it's not written - * automatically. + * If we are going to release inode from memory, we truncate last inode extent + * to proper length. We could use drop_inode() but it's called under inode_lock + * and thus we cannot mark inode dirty there. We use clear_inode() but we have + * to make sure to write inode as it's not written automatically. */ void udf_clear_inode(struct inode *inode) { struct udf_inode_info *iinfo; if (!(inode->i_sb->s_flags & MS_RDONLY)) { lock_kernel(); - /* Discard preallocation for directories, symlinks, etc. */ - udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); unlock_kernel(); write_inode_now(inode, 0); @@ -664,8 +661,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); #ifdef UDF_PREALLOCATE - /* preallocate blocks */ - udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); + /* We preallocate blocks only for regular files. It also makes sense + * for directories but there's a problem when to drop the + * preallocation. We might use some delayed work for that but I feel + * it's overengineering for a filesystem like UDF. */ + if (S_ISREG(inode->i_mode)) + udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); #endif /* merge any continuous blocks in laarr */ diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 1b88fd5df05d..43e24a3b8e10 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -36,14 +36,10 @@ unsigned int udf_get_last_session(struct super_block *sb) ms_info.addr_format = CDROM_LBA; i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info); -#define WE_OBEY_THE_WRITTEN_STANDARDS 1 - if (i == 0) { udf_debug("XA disk: %s, vol_desc_start=%d\n", (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); -#if WE_OBEY_THE_WRITTEN_STANDARDS if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ -#endif vol_desc_start = ms_info.addr.lba; } else { udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 6a29fa34c478..21dad8c608f9 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -943,7 +943,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, pc->componentType = 1; pc->lengthComponentIdent = 0; pc->componentFileVersionNum = 0; - pc += sizeof(struct pathComponent); elen += sizeof(struct pathComponent); } diff --git a/fs/udf/super.c b/fs/udf/super.c index 6832135159b6..9d1b8c2e6c45 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1087,11 +1087,23 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) struct udf_inode_info *vati; uint32_t pos; struct virtualAllocationTable20 *vat20; + sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; /* VAT file entry is in the last recorded block */ ino.partitionReferenceNum = type1_index; ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; sbi->s_vat_inode = udf_iget(sb, &ino); + if (!sbi->s_vat_inode && + sbi->s_last_block != blocks - 1) { + printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" + " last recorded block (%lu), retrying with the last " + "block of the device (%lu).\n", + (unsigned long)sbi->s_last_block, + (unsigned long)blocks - 1); + ino.partitionReferenceNum = type1_index; + ino.logicalBlockNum = blocks - 1 - map->s_partition_root; + sbi->s_vat_inode = udf_iget(sb, &ino); + } if (!sbi->s_vat_inode) return 1; diff --git a/fs/xattr.c b/fs/xattr.c index 1c3d0af59ddf..6d4f6d3449fb 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask) return inode_permission(inode, mask); } -int -vfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) +/** + * __vfs_setxattr_noperm - perform setxattr operation without performing + * permission checks. + * + * @dentry - object to perform setxattr on + * @name - xattr name to set + * @value - value to set @name to + * @size - size of @value + * @flags - flags to pass into filesystem operations + * + * returns the result of the internal setxattr or setsecurity operations. + * + * This function requires the caller to lock the inode's i_mutex before it + * is executed. It also assumes that the caller will make the appropriate + * permission checks. + */ +int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; - int error; - - error = xattr_permission(inode, name, MAY_WRITE); - if (error) - return error; + int error = -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); - error = security_inode_setxattr(dentry, name, value, size, flags); - if (error) - goto out; - error = -EOPNOTSUPP; if (inode->i_op->setxattr) { error = inode->i_op->setxattr(dentry, name, value, size, flags); if (!error) { @@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!error) fsnotify_xattr(dentry); } + + return error; +} + + +int +vfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = security_inode_setxattr(dentry, name, value, size, flags); + if (error) + goto out; + + error = __vfs_setxattr_noperm(dentry, name, value, size, flags); + out: mutex_unlock(&inode->i_mutex); return error; diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index 1cd3b55ee3d2..2d3f90afe5f1 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -53,7 +53,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags) printk(KERN_ERR "XFS: possible memory allocation " "deadlock in %s (mode:0x%x)\n", __func__, lflags); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } @@ -130,7 +130,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) printk(KERN_ERR "XFS: possible memory allocation " "deadlock in %s (mode:0x%x)\n", __func__, lflags); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 7ec89fc05b2b..d5e5559e31db 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -216,7 +216,6 @@ xfs_setfilesize( if (ip->i_d.di_size < isize) { ip->i_d.di_size = isize; ip->i_update_core = 1; - ip->i_update_size = 1; xfs_mark_inode_dirty_sync(ip); } @@ -1268,6 +1267,14 @@ xfs_vm_writepage( if (!page_has_buffers(page)) create_empty_buffers(page, 1 << inode->i_blkbits, 0); + + /* + * VM calculation for nr_to_write seems off. Bump it way + * up, this gets simple streaming writes zippy again. + * To be reviewed again after Jens' writeback changes. + */ + wbc->nr_to_write *= 4; + /* * Convert delayed allocate, unwritten or unmapped space * to real space and flush out to disk. diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 1418b916fc27..965df1227d64 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -412,7 +412,7 @@ _xfs_buf_lookup_pages( XFS_STATS_INC(xb_page_retries); xfsbufd_wakeup(0, gfp_mask); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } @@ -770,7 +770,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, 0); + rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); if (rval) return rval; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index f4e255441574..988d8f87bc0f 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -41,7 +41,6 @@ #include "xfs_ioctl.h" #include <linux/dcache.h> -#include <linux/smp_lock.h> static struct vm_operations_struct xfs_file_vm_ops; @@ -173,12 +172,21 @@ xfs_file_release( */ STATIC int xfs_file_fsync( - struct file *filp, - struct dentry *dentry, - int datasync) + struct file *file, + struct dentry *dentry, + int datasync) { - xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); - return -xfs_fsync(XFS_I(dentry->d_inode)); + struct inode *inode = dentry->d_inode; + struct xfs_inode *ip = XFS_I(inode); + int error; + + /* capture size updates in I/O completion before writing the inode. */ + error = filemap_fdatawait(inode->i_mapping); + if (error) + return error; + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + return -xfs_fsync(ip); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 0882d166239a..eafcc7c18706 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -619,7 +619,7 @@ xfs_file_compat_ioctl( case XFS_IOC_GETVERSION_32: cmd = _NATIVE_IOC(cmd, long); return xfs_file_ioctl(filp, cmd, p); - case XFS_IOC_SWAPEXT: { + case XFS_IOC_SWAPEXT_32: { struct xfs_swapext sxp; struct compat_xfs_swapext __user *sxu = arg; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 58973bb46038..da0159d99f82 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -43,7 +43,6 @@ #include "xfs_error.h" #include "xfs_itable.h" #include "xfs_rw.h" -#include "xfs_acl.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" @@ -485,14 +484,6 @@ xfs_vn_put_link( } STATIC int -xfs_vn_permission( - struct inode *inode, - int mask) -{ - return generic_permission(inode, mask, xfs_check_acl); -} - -STATIC int xfs_vn_getattr( struct vfsmount *mnt, struct dentry *dentry, @@ -680,8 +671,8 @@ xfs_vn_fiemap( else bm.bmv_length = BTOBB(length); - /* our formatter will tell xfs_getbmap when to stop. */ - bm.bmv_count = MAXEXTNUM; + /* We add one because in getbmap world count includes the header */ + bm.bmv_count = fieinfo->fi_extents_max + 1; bm.bmv_iflags = BMV_IF_PREALLOC; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) bm.bmv_iflags |= BMV_IF_ATTRFORK; @@ -696,7 +687,7 @@ xfs_vn_fiemap( } static const struct inode_operations xfs_inode_operations = { - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .truncate = xfs_vn_truncate, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -724,7 +715,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -749,7 +740,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -762,7 +753,7 @@ static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, .put_link = xfs_vn_put_link, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 7078974a6eee..49e4a6aea73c 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -812,18 +812,21 @@ write_retry: /* Handle various SYNC-type writes */ if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + loff_t end = pos + ret - 1; int error2; xfs_iunlock(xip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); - error2 = sync_page_range(inode, mapping, pos, ret); + + error2 = filemap_write_and_wait_range(mapping, pos, end); if (!error) error = error2; if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_ilock(xip, iolock); - error2 = xfs_write_sync_logforce(mp, xip); + + error2 = xfs_fsync(xip); if (!error) error = error2; } diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c index c3526d445f6a..76fdc5861932 100644 --- a/fs/xfs/linux-2.6/xfs_stats.c +++ b/fs/xfs/linux-2.6/xfs_stats.c @@ -20,16 +20,9 @@ DEFINE_PER_CPU(struct xfsstats, xfsstats); -STATIC int -xfs_read_xfsstats( - char *buffer, - char **start, - off_t offset, - int count, - int *eof, - void *data) +static int xfs_stat_proc_show(struct seq_file *m, void *v) { - int c, i, j, len, val; + int c, i, j, val; __uint64_t xs_xstrat_bytes = 0; __uint64_t xs_write_bytes = 0; __uint64_t xs_read_bytes = 0; @@ -60,18 +53,18 @@ xfs_read_xfsstats( }; /* Loop over all stats groups */ - for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) { - len += sprintf(buffer + len, "%s", xstats[i].desc); + for (i=j = 0; i < ARRAY_SIZE(xstats); i++) { + seq_printf(m, "%s", xstats[i].desc); /* inner loop does each group */ while (j < xstats[i].endpoint) { val = 0; /* sum over all cpus */ for_each_possible_cpu(c) val += *(((__u32*)&per_cpu(xfsstats, c) + j)); - len += sprintf(buffer + len, " %u", val); + seq_printf(m, " %u", val); j++; } - buffer[len++] = '\n'; + seq_putc(m, '\n'); } /* extra precision counters */ for_each_possible_cpu(i) { @@ -80,36 +73,38 @@ xfs_read_xfsstats( xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; } - len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n", + seq_printf(m, "xpc %Lu %Lu %Lu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); - len += sprintf(buffer + len, "debug %u\n", + seq_printf(m, "debug %u\n", #if defined(DEBUG) 1); #else 0); #endif + return 0; +} - if (offset >= len) { - *start = buffer; - *eof = 1; - return 0; - } - *start = buffer + offset; - if ((len -= offset) > count) - return count; - *eof = 1; - - return len; +static int xfs_stat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xfs_stat_proc_show, NULL); } +static const struct file_operations xfs_stat_proc_fops = { + .owner = THIS_MODULE, + .open = xfs_stat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + int xfs_init_procfs(void) { if (!proc_mkdir("fs/xfs", NULL)) goto out; - if (!create_proc_read_entry("fs/xfs/stat", 0, NULL, - xfs_read_xfsstats, NULL)) + if (!proc_create("fs/xfs/stat", 0, NULL, + &xfs_stat_proc_fops)) goto out_remove_entry; return 0; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index a220d36f789b..5d7c60ac77b4 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -579,15 +579,19 @@ xfs_showargs( else if (mp->m_qflags & XFS_UQUOTA_ACCT) seq_puts(m, "," MNTOPT_UQUOTANOENF); - if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) - seq_puts(m, "," MNTOPT_PRJQUOTA); - else if (mp->m_qflags & XFS_PQUOTA_ACCT) - seq_puts(m, "," MNTOPT_PQUOTANOENF); - - if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD)) - seq_puts(m, "," MNTOPT_GRPQUOTA); - else if (mp->m_qflags & XFS_GQUOTA_ACCT) - seq_puts(m, "," MNTOPT_GQUOTANOENF); + /* Either project or group quotas can be active, not both */ + + if (mp->m_qflags & XFS_PQUOTA_ACCT) { + if (mp->m_qflags & XFS_OQUOTA_ENFD) + seq_puts(m, "," MNTOPT_PRJQUOTA); + else + seq_puts(m, "," MNTOPT_PQUOTANOENF); + } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { + if (mp->m_qflags & XFS_OQUOTA_ENFD) + seq_puts(m, "," MNTOPT_GRPQUOTA); + else + seq_puts(m, "," MNTOPT_GQUOTANOENF); + } if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, "," MNTOPT_NOQUOTA); @@ -687,7 +691,7 @@ xfs_barrier_test( return error; } -void +STATIC void xfs_mountfs_check_barriers(xfs_mount_t *mp) { int error; diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index b619d6b8ca43..320be6aea492 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -708,6 +708,16 @@ xfs_reclaim_inode( return 0; } +void +__xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); +} + /* * We set the inode flag atomically with the radix tree tag. * Once we get tag lookups on the radix tree, this inode flag @@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag( read_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); @@ -740,21 +749,6 @@ __xfs_inode_clear_reclaim_tag( XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); } -void -xfs_inode_clear_reclaim_tag( - xfs_inode_t *ip) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); - - read_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - __xfs_inode_clear_reclaim_tag(mp, pag, ip); - spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); - xfs_put_perag(mp, pag); -} - STATIC int xfs_reclaim_inode_now( struct xfs_inode *ip, diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 2a10301c99c7..27920eb7a820 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -48,7 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); -void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip); +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, struct xfs_inode *ip); diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c index 21b08c0396a1..83e7ea3e25fa 100644 --- a/fs/xfs/quota/xfs_qm_stats.c +++ b/fs/xfs/quota/xfs_qm_stats.c @@ -48,50 +48,34 @@ struct xqmstats xqmstats; -STATIC int -xfs_qm_read_xfsquota( - char *buffer, - char **start, - off_t offset, - int count, - int *eof, - void *data) +static int xqm_proc_show(struct seq_file *m, void *v) { - int len; - /* maximum; incore; ratio free to inuse; freelist */ - len = sprintf(buffer, "%d\t%d\t%d\t%u\n", + seq_printf(m, "%d\t%d\t%d\t%u\n", ndquot, xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); - - if (offset >= len) { - *start = buffer; - *eof = 1; - return 0; - } - *start = buffer + offset; - if ((len -= offset) > count) - return count; - *eof = 1; - - return len; + return 0; } -STATIC int -xfs_qm_read_stats( - char *buffer, - char **start, - off_t offset, - int count, - int *eof, - void *data) +static int xqm_proc_open(struct inode *inode, struct file *file) { - int len; + return single_open(file, xqm_proc_show, NULL); +} + +static const struct file_operations xqm_proc_fops = { + .owner = THIS_MODULE, + .open = xqm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +static int xqmstat_proc_show(struct seq_file *m, void *v) +{ /* quota performance statistics */ - len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n", + seq_printf(m, "qm %u %u %u %u %u %u %u %u\n", xqmstats.xs_qm_dqreclaims, xqmstats.xs_qm_dqreclaim_misses, xqmstats.xs_qm_dquot_dups, @@ -100,25 +84,27 @@ xfs_qm_read_stats( xqmstats.xs_qm_dqwants, xqmstats.xs_qm_dqshake_reclaims, xqmstats.xs_qm_dqinact_reclaims); + return 0; +} - if (offset >= len) { - *start = buffer; - *eof = 1; - return 0; - } - *start = buffer + offset; - if ((len -= offset) > count) - return count; - *eof = 1; - - return len; +static int xqmstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqmstat_proc_show, NULL); } +static const struct file_operations xqmstat_proc_fops = { + .owner = THIS_MODULE, + .open = xqmstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + void xfs_qm_init_procfs(void) { - create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL); - create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL); + proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops); + proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops); } void diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index f24b50b68d03..a5d54bf4931b 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -198,6 +198,15 @@ typedef struct xfs_perag xfs_agino_t pagi_count; /* number of allocated inodes */ int pagb_count; /* pagb slots in use */ xfs_perag_busy_t *pagb_list; /* unstable blocks */ + + /* + * Inode allocation search lookup optimisation. + * If the pagino matches, the search for new inodes + * doesn't need to search the near ones again straight away + */ + xfs_agino_t pagl_pagino; + xfs_agino_t pagl_leftrec; + xfs_agino_t pagl_rightrec; #ifdef __KERNEL__ spinlock_t pagb_lock; /* lock for pagb_list */ diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index db15feb906ff..4ece1906bd41 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2010,7 +2010,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, - blkcnt, XFS_BUF_LOCK, &bp); + blkcnt, + XFS_BUF_LOCK | XBF_DONT_BLOCK, + &bp); if (error) return(error); @@ -2141,8 +2143,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, - blkcnt, XFS_BUF_LOCK); + bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt, + XFS_BUF_LOCK | XBF_DONT_BLOCK); ASSERT(bp); ASSERT(!XFS_BUF_GETERROR(bp)); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 7928b9983c1d..8971fb09d387 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3713,7 +3713,7 @@ done: * entry (null if none). Else, *lastxp will be set to the index * of the found entry; *gotp will contain the entry. */ -xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ xfs_bmap_search_multi_extents( xfs_ifork_t *ifp, /* inode fork pointer */ xfs_fileoff_t bno, /* block number searched for */ @@ -6009,7 +6009,7 @@ xfs_getbmap( */ error = ENOMEM; subnex = 16; - map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL); + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); if (!map) goto out_unlock_ilock; diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 1b8ff9256bd0..56f62d2edc35 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -392,17 +392,6 @@ xfs_bmap_count_blocks( int whichfork, int *count); -/* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. - */ -xfs_bmbt_rec_host_t * -xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *, - xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *); - #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 5c1ade06578e..eb7b702d0690 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -202,16 +202,6 @@ xfs_bmbt_get_state( ext_flag); } -/* Endian flipping versions of the bmbt extraction functions */ -void -xfs_bmbt_disk_get_all( - xfs_bmbt_rec_t *r, - xfs_bmbt_irec_t *s) -{ - __xfs_bmbt_get_all(get_unaligned_be64(&r->l0), - get_unaligned_be64(&r->l1), s); -} - /* * Extract the blockcount field from an on disk bmap extent record. */ @@ -816,6 +806,16 @@ xfs_bmbt_trace_key( *l1 = 0; } +/* Endian flipping versions of the bmbt extraction functions */ +STATIC void +xfs_bmbt_disk_get_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + __xfs_bmbt_get_all(get_unaligned_be64(&r->l0), + get_unaligned_be64(&r->l1), s); +} + STATIC void xfs_bmbt_trace_record( struct xfs_btree_cur *cur, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 0e8df007615e..5549d495947f 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); -extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e9df99574829..52b5f14d0c32 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -120,8 +120,8 @@ xfs_btree_check_sblock( XFS_RANDOM_BTREE_CHECK_SBLOCK))) { if (bp) xfs_buftrace("SBTREE ERROR", bp); - XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW, - cur->bc_mp); + XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", + XFS_ERRLEVEL_LOW, cur->bc_mp, block); return XFS_ERROR(EFSCORRUPTED); } return 0; @@ -646,46 +646,6 @@ xfs_btree_read_bufl( } /* - * Get a buffer for the block, return it read in. - * Short-form addressing. - */ -int /* error */ -xfs_btree_read_bufs( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - uint lock, /* lock flags for read_buf */ - xfs_buf_t **bpp, /* buffer for agno/agbno */ - int refval) /* ref count value for buffer */ -{ - xfs_buf_t *bp; /* return value */ - xfs_daddr_t d; /* real disk block address */ - int error; - - ASSERT(agno != NULLAGNUMBER); - ASSERT(agbno != NULLAGBLOCK); - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp))) { - return error; - } - ASSERT(!bp || !XFS_BUF_GETERROR(bp)); - if (bp != NULL) { - switch (refval) { - case XFS_ALLOC_BTREE_REF: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); - break; - case XFS_INO_BTREE_REF: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval); - break; - } - } - *bpp = bp; - return 0; -} - -/* * Read-ahead the block, don't wait for it, don't return a buffer. * Long-form addressing. */ @@ -2951,7 +2911,7 @@ error0: * inode we have to copy the single block it was pointing to into the * inode. */ -int +STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 4f852b735b96..7fa07062bdda 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -379,20 +379,6 @@ xfs_btree_read_bufl( int refval);/* ref count value for buffer */ /* - * Get a buffer for the block, return it read in. - * Short-form addressing. - */ -int /* error */ -xfs_btree_read_bufs( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - uint lock, /* lock flags for read_buf */ - struct xfs_buf **bpp, /* buffer for agno/agbno */ - int refval);/* ref count value for buffer */ - -/* * Read-ahead the block, don't wait for it, don't return a buffer. * Long-form addressing. */ @@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *); int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); -int xfs_btree_kill_iroot(struct xfs_btree_cur *); int xfs_btree_insert(struct xfs_btree_cur *, int *); int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 9ff6e57a5075..2847bbc1c534 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2201,7 +2201,7 @@ kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ xfs_da_state_t * xfs_da_state_alloc(void) { - return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP); + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); } /* @@ -2261,9 +2261,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) int off; if (nbuf == 1) - dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP); + dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS); else - dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP); + dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS); dabuf->dirty = 0; #ifdef XFS_DABUF_DEBUG dabuf->ra = ra; diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index c657bec6d951..bb1d58eb3982 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -256,7 +256,7 @@ xfs_dir_cilookup_result( !(args->op_flags & XFS_DA_OP_CILOOKUP)) return EEXIST; - args->value = kmem_alloc(len, KM_MAYFAIL); + args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); if (!args->value) return ENOMEM; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cbd451bb4848..2d0b3e1da9e6 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -167,17 +167,25 @@ xfs_growfs_data_private( new = nb - mp->m_sb.sb_dblocks; oagcount = mp->m_sb.sb_agcount; if (nagcount > oagcount) { + void *new_perag, *old_perag; + xfs_filestream_flush(mp); + + new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount, + KM_MAYFAIL); + if (!new_perag) + return XFS_ERROR(ENOMEM); + down_write(&mp->m_peraglock); - mp->m_perag = kmem_realloc(mp->m_perag, - sizeof(xfs_perag_t) * nagcount, - sizeof(xfs_perag_t) * oagcount, - KM_SLEEP); - memset(&mp->m_perag[oagcount], 0, - (nagcount - oagcount) * sizeof(xfs_perag_t)); + memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount); + old_perag = mp->m_perag; + mp->m_perag = new_perag; + mp->m_flags |= XFS_MOUNT_32BITINODES; nagimax = xfs_initialize_perag(mp, nagcount); up_write(&mp->m_peraglock); + + kmem_free(old_perag); } tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); tp->t_flags |= XFS_TRANS_RESERVE; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 3120a3a5e20f..ab64f3efb43b 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -57,75 +57,35 @@ xfs_ialloc_cluster_alignment( } /* - * Lookup the record equal to ino in the btree given by cur. - */ -STATIC int /* error */ -xfs_inobt_lookup_eq( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free, /* free inode mask */ - int *stat) /* success/failure */ -{ - cur->bc_rec.i.ir_startino = ino; - cur->bc_rec.i.ir_freecount = fcnt; - cur->bc_rec.i.ir_free = free; - return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); -} - -/* - * Lookup the first record greater than or equal to ino - * in the btree given by cur. + * Lookup a record by ino in the btree given by cur. */ int /* error */ -xfs_inobt_lookup_ge( +xfs_inobt_lookup( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free, /* free inode mask */ + xfs_lookup_t dir, /* <=, >=, == */ int *stat) /* success/failure */ { cur->bc_rec.i.ir_startino = ino; - cur->bc_rec.i.ir_freecount = fcnt; - cur->bc_rec.i.ir_free = free; - return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); + cur->bc_rec.i.ir_freecount = 0; + cur->bc_rec.i.ir_free = 0; + return xfs_btree_lookup(cur, dir, stat); } /* - * Lookup the first record less than or equal to ino - * in the btree given by cur. - */ -int /* error */ -xfs_inobt_lookup_le( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free, /* free inode mask */ - int *stat) /* success/failure */ -{ - cur->bc_rec.i.ir_startino = ino; - cur->bc_rec.i.ir_freecount = fcnt; - cur->bc_rec.i.ir_free = free; - return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); -} - -/* - * Update the record referred to by cur to the value given - * by [ino, fcnt, free]. + * Update the record referred to by cur to the value given. * This either works (return 0) or gets an EFSCORRUPTED error. */ STATIC int /* error */ xfs_inobt_update( struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free) /* free inode mask */ + xfs_inobt_rec_incore_t *irec) /* btree record */ { union xfs_btree_rec rec; - rec.inobt.ir_startino = cpu_to_be32(ino); - rec.inobt.ir_freecount = cpu_to_be32(fcnt); - rec.inobt.ir_free = cpu_to_be64(free); + rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); + rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); + rec.inobt.ir_free = cpu_to_be64(irec->ir_free); return xfs_btree_update(cur, &rec); } @@ -135,9 +95,7 @@ xfs_inobt_update( int /* error */ xfs_inobt_get_rec( struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agino_t *ino, /* output: starting inode of chunk */ - __int32_t *fcnt, /* output: number of free inodes */ - xfs_inofree_t *free, /* output: free inode mask */ + xfs_inobt_rec_incore_t *irec, /* btree record */ int *stat) /* output: success/failure */ { union xfs_btree_rec *rec; @@ -145,14 +103,136 @@ xfs_inobt_get_rec( error = xfs_btree_get_rec(cur, &rec, stat); if (!error && *stat == 1) { - *ino = be32_to_cpu(rec->inobt.ir_startino); - *fcnt = be32_to_cpu(rec->inobt.ir_freecount); - *free = be64_to_cpu(rec->inobt.ir_free); + irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); + irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); + irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } return error; } /* + * Verify that the number of free inodes in the AGI is correct. + */ +#ifdef DEBUG +STATIC int +xfs_check_agi_freecount( + struct xfs_btree_cur *cur, + struct xfs_agi *agi) +{ + if (cur->bc_nlevels == 1) { + xfs_inobt_rec_incore_t rec; + int freecount = 0; + int error; + int i; + + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + return error; + + do { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + return error; + + if (i) { + freecount += rec.ir_freecount; + error = xfs_btree_increment(cur, 0, &i); + if (error) + return error; + } + } while (i == 1); + + if (!XFS_FORCED_SHUTDOWN(cur->bc_mp)) + ASSERT(freecount == be32_to_cpu(agi->agi_freecount)); + } + return 0; +} +#else +#define xfs_check_agi_freecount(cur, agi) 0 +#endif + +/* + * Initialise a new set of inodes. + */ +STATIC void +xfs_ialloc_inode_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_agblock_t length, + unsigned int gen) +{ + struct xfs_buf *fbuf; + struct xfs_dinode *free; + int blks_per_cluster, nbufs, ninodes; + int version; + int i, j; + xfs_daddr_t d; + + /* + * Loop over the new block(s), filling in the inodes. + * For small block sizes, manipulate the inodes in buffers + * which are multiples of the blocks size. + */ + if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { + blks_per_cluster = 1; + nbufs = length; + ninodes = mp->m_sb.sb_inopblock; + } else { + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / + mp->m_sb.sb_blocksize; + nbufs = length / blks_per_cluster; + ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; + } + + /* + * Figure out what version number to use in the inodes we create. + * If the superblock version has caught up to the one that supports + * the new inode format, then use the new inode version. Otherwise + * use the old version so that old kernels will continue to be + * able to use the file system. + */ + if (xfs_sb_version_hasnlink(&mp->m_sb)) + version = 2; + else + version = 1; + + for (j = 0; j < nbufs; j++) { + /* + * Get the block. + */ + d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); + fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize * blks_per_cluster, + XFS_BUF_LOCK); + ASSERT(fbuf); + ASSERT(!XFS_BUF_GETERROR(fbuf)); + + /* + * Initialize all inodes in this buffer and then log them. + * + * XXX: It would be much better if we had just one transaction + * to log a whole cluster of inodes instead of all the + * individual transactions causing a lot of log traffic. + */ + xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); + for (i = 0; i < ninodes; i++) { + int ioffset = i << mp->m_sb.sb_inodelog; + uint isize = sizeof(struct xfs_dinode); + + free = xfs_make_iptr(mp, fbuf, i); + free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + free->di_version = version; + free->di_gen = cpu_to_be32(gen); + free->di_next_unlinked = cpu_to_be32(NULLAGINO); + xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); + } + xfs_trans_inode_alloc_buf(tp, fbuf); + } +} + +/* * Allocate new inodes in the allocation group specified by agbp. * Return 0 for success, else error code. */ @@ -164,24 +244,15 @@ xfs_ialloc_ag_alloc( { xfs_agi_t *agi; /* allocation group header */ xfs_alloc_arg_t args; /* allocation argument structure */ - int blks_per_cluster; /* fs blocks per inode cluster */ xfs_btree_cur_t *cur; /* inode btree cursor */ - xfs_daddr_t d; /* disk addr of buffer */ xfs_agnumber_t agno; int error; - xfs_buf_t *fbuf; /* new free inodes' buffer */ - xfs_dinode_t *free; /* new free inode structure */ - int i; /* inode counter */ - int j; /* block counter */ - int nbufs; /* num bufs of new inodes */ + int i; xfs_agino_t newino; /* new first inode's number */ xfs_agino_t newlen; /* new number of inodes */ - int ninodes; /* num inodes per buf */ xfs_agino_t thisino; /* current inode number, for loop */ - int version; /* inode version number to use */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ - unsigned int gen; args.tp = tp; args.mp = tp->t_mountp; @@ -202,12 +273,12 @@ xfs_ialloc_ag_alloc( */ agi = XFS_BUF_TO_AGI(agbp); newino = be32_to_cpu(agi->agi_newino); + agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + XFS_IALLOC_BLOCKS(args.mp); if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { - args.fsbno = XFS_AGB_TO_FSB(args.mp, - be32_to_cpu(agi->agi_seqno), args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; args.mod = args.total = args.wasdel = args.isfl = args.userdata = args.minalignslop = 0; @@ -258,8 +329,7 @@ xfs_ialloc_ag_alloc( * For now, just allocate blocks up front. */ args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, - be32_to_cpu(agi->agi_seqno), args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); /* * Allocate a fixed-size extent of inodes. */ @@ -282,8 +352,7 @@ xfs_ialloc_ag_alloc( if (isaligned && args.fsbno == NULLFSBLOCK) { args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, - be32_to_cpu(agi->agi_seqno), args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); args.alignment = xfs_ialloc_cluster_alignment(&args); if ((error = xfs_alloc_vextent(&args))) return error; @@ -294,85 +363,30 @@ xfs_ialloc_ag_alloc( return 0; } ASSERT(args.len == args.minlen); - /* - * Convert the results. - */ - newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); - /* - * Loop over the new block(s), filling in the inodes. - * For small block sizes, manipulate the inodes in buffers - * which are multiples of the blocks size. - */ - if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) { - blks_per_cluster = 1; - nbufs = (int)args.len; - ninodes = args.mp->m_sb.sb_inopblock; - } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) / - args.mp->m_sb.sb_blocksize; - nbufs = (int)args.len / blks_per_cluster; - ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock; - } - /* - * Figure out what version number to use in the inodes we create. - * If the superblock version has caught up to the one that supports - * the new inode format, then use the new inode version. Otherwise - * use the old version so that old kernels will continue to be - * able to use the file system. - */ - if (xfs_sb_version_hasnlink(&args.mp->m_sb)) - version = 2; - else - version = 1; /* + * Stamp and write the inode buffers. + * * Seed the new inode cluster with a random generation number. This * prevents short-term reuse of generation numbers if a chunk is * freed and then immediately reallocated. We use random numbers * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - gen = random32(); - for (j = 0; j < nbufs; j++) { - /* - * Get the block. - */ - d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno), - args.agbno + (j * blks_per_cluster)); - fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d, - args.mp->m_bsize * blks_per_cluster, - XFS_BUF_LOCK); - ASSERT(fbuf); - ASSERT(!XFS_BUF_GETERROR(fbuf)); + xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len, + random32()); - /* - * Initialize all inodes in this buffer and then log them. - * - * XXX: It would be much better if we had just one transaction to - * log a whole cluster of inodes instead of all the individual - * transactions causing a lot of log traffic. - */ - xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); - for (i = 0; i < ninodes; i++) { - int ioffset = i << args.mp->m_sb.sb_inodelog; - uint isize = sizeof(struct xfs_dinode); - - free = xfs_make_iptr(args.mp, fbuf, i); - free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); - free->di_version = version; - free->di_gen = cpu_to_be32(gen); - free->di_next_unlinked = cpu_to_be32(NULLAGINO); - xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); - } - xfs_trans_inode_alloc_buf(tp, fbuf); - } + /* + * Convert the results. + */ + newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); - agno = be32_to_cpu(agi->agi_seqno); down_read(&args.mp->m_peraglock); args.mp->m_perag[agno].pagi_freecount += newlen; up_read(&args.mp->m_peraglock); agi->agi_newino = cpu_to_be32(newino); + /* * Insert records describing the new inode chunk into the btree. */ @@ -380,13 +394,17 @@ xfs_ialloc_ag_alloc( for (thisino = newino; thisino < newino + newlen; thisino += XFS_INODES_PER_CHUNK) { - if ((error = xfs_inobt_lookup_eq(cur, thisino, - XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) { + cur->bc_rec.i.ir_startino = thisino; + cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK; + cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE; + error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i); + if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } ASSERT(i == 0); - if ((error = xfs_btree_insert(cur, &i))) { + error = xfs_btree_insert(cur, &i); + if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } @@ -539,6 +557,62 @@ nextag: } /* + * Try to retrieve the next record to the left/right from the current one. + */ +STATIC int +xfs_ialloc_next_rec( + struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, + int *done, + int left) +{ + int error; + int i; + + if (left) + error = xfs_btree_decrement(cur, 0, &i); + else + error = xfs_btree_increment(cur, 0, &i); + + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + + return 0; +} + +STATIC int +xfs_ialloc_get_rec( + struct xfs_btree_cur *cur, + xfs_agino_t agino, + xfs_inobt_rec_incore_t *rec, + int *done, + int left) +{ + int error; + int i; + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i); + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + + return 0; +} + +/* * Visible inode allocation functions. */ @@ -592,8 +666,8 @@ xfs_dialloc( int j; /* result code */ xfs_mount_t *mp; /* file system mount structure */ int offset; /* index of inode in chunk */ - xfs_agino_t pagino; /* parent's a.g. relative inode # */ - xfs_agnumber_t pagno; /* parent's allocation group number */ + xfs_agino_t pagino; /* parent's AG relative inode # */ + xfs_agnumber_t pagno; /* parent's AG number */ xfs_inobt_rec_incore_t rec; /* inode allocation record */ xfs_agnumber_t tagno; /* testing allocation group number */ xfs_btree_cur_t *tcur; /* temp cursor */ @@ -716,6 +790,8 @@ nextag: */ agno = tagno; *IO_agbp = NULL; + + restart_pagno: cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); /* * If pagino is 0 (this is the root inode allocation) use newino. @@ -723,220 +799,199 @@ nextag: */ if (!pagino) pagino = be32_to_cpu(agi->agi_newino); -#ifdef DEBUG - if (cur->bc_nlevels == 1) { - int freecount = 0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - do { - if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - freecount += rec.ir_freecount; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto error0; - } while (i == 1); + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; - ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); - } -#endif /* - * If in the same a.g. as the parent, try to get near the parent. + * If in the same AG as the parent, try to get near the parent. */ if (pagno == agno) { - if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i))) + xfs_perag_t *pag = &mp->m_perag[agno]; + int doneleft; /* done, to the left */ + int doneright; /* done, to the right */ + int searchdistance = 10; + + error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_inobt_get_rec(cur, &rec, &j); + if (error) goto error0; - if (i != 0 && - (error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &j)) == 0 && - j == 1 && - rec.ir_freecount > 0) { + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + if (rec.ir_freecount > 0) { /* * Found a free inode in the same chunk - * as parent, done. + * as the parent, done. */ + goto alloc_inode; } + + + /* + * In the same AG as parent, but parent's chunk is full. + */ + + /* duplicate the cursor, search left & right simultaneously */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + /* - * In the same a.g. as parent, but parent's chunk is full. + * Skip to last blocks looked up if same parent inode. */ - else { - int doneleft; /* done, to the left */ - int doneright; /* done, to the right */ + if (pagino != NULLAGINO && + pag->pagl_pagino == pagino && + pag->pagl_leftrec != NULLAGINO && + pag->pagl_rightrec != NULLAGINO) { + error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, + &trec, &doneleft, 1); + if (error) + goto error1; + error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, + &rec, &doneright, 0); if (error) - goto error0; - ASSERT(i == 1); - ASSERT(j == 1); - /* - * Duplicate the cursor, search left & right - * simultaneously. - */ - if ((error = xfs_btree_dup_cursor(cur, &tcur))) - goto error0; - /* - * Search left with tcur, back up 1 record. - */ - if ((error = xfs_btree_decrement(tcur, 0, &i))) goto error1; - doneleft = !i; - if (!doneleft) { - if ((error = xfs_inobt_get_rec(tcur, - &trec.ir_startino, - &trec.ir_freecount, - &trec.ir_free, &i))) - goto error1; - XFS_WANT_CORRUPTED_GOTO(i == 1, error1); - } - /* - * Search right with cur, go forward 1 record. - */ - if ((error = xfs_btree_increment(cur, 0, &i))) + } else { + /* search left with tcur, back up 1 record */ + error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1); + if (error) goto error1; - doneright = !i; - if (!doneright) { - if ((error = xfs_inobt_get_rec(cur, - &rec.ir_startino, - &rec.ir_freecount, - &rec.ir_free, &i))) - goto error1; - XFS_WANT_CORRUPTED_GOTO(i == 1, error1); - } - /* - * Loop until we find the closest inode chunk - * with a free one. - */ - while (!doneleft || !doneright) { - int useleft; /* using left inode - chunk this time */ + /* search right with cur, go forward 1 record. */ + error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0); + if (error) + goto error1; + } + + /* + * Loop until we find an inode chunk with a free inode. + */ + while (!doneleft || !doneright) { + int useleft; /* using left inode chunk this time */ + + if (!--searchdistance) { /* - * Figure out which block is closer, - * if both are valid. - */ - if (!doneleft && !doneright) - useleft = - pagino - - (trec.ir_startino + - XFS_INODES_PER_CHUNK - 1) < - rec.ir_startino - pagino; - else - useleft = !doneleft; - /* - * If checking the left, does it have - * free inodes? - */ - if (useleft && trec.ir_freecount) { - /* - * Yes, set it up as the chunk to use. - */ - rec = trec; - xfs_btree_del_cursor(cur, - XFS_BTREE_NOERROR); - cur = tcur; - break; - } - /* - * If checking the right, does it have - * free inodes? - */ - if (!useleft && rec.ir_freecount) { - /* - * Yes, it's already set up. - */ - xfs_btree_del_cursor(tcur, - XFS_BTREE_NOERROR); - break; - } - /* - * If used the left, get another one - * further left. - */ - if (useleft) { - if ((error = xfs_btree_decrement(tcur, 0, - &i))) - goto error1; - doneleft = !i; - if (!doneleft) { - if ((error = xfs_inobt_get_rec( - tcur, - &trec.ir_startino, - &trec.ir_freecount, - &trec.ir_free, &i))) - goto error1; - XFS_WANT_CORRUPTED_GOTO(i == 1, - error1); - } - } - /* - * If used the right, get another one - * further right. + * Not in range - save last search + * location and allocate a new inode */ - else { - if ((error = xfs_btree_increment(cur, 0, - &i))) - goto error1; - doneright = !i; - if (!doneright) { - if ((error = xfs_inobt_get_rec( - cur, - &rec.ir_startino, - &rec.ir_freecount, - &rec.ir_free, &i))) - goto error1; - XFS_WANT_CORRUPTED_GOTO(i == 1, - error1); - } - } + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto newino; + } + + /* figure out the closer block if both are valid. */ + if (!doneleft && !doneright) { + useleft = pagino - + (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) < + rec.ir_startino - pagino; + } else { + useleft = !doneleft; } - ASSERT(!doneleft || !doneright); + + /* free inodes to the left? */ + if (useleft && trec.ir_freecount) { + rec = trec; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + cur = tcur; + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto alloc_inode; + } + + /* free inodes to the right? */ + if (!useleft && rec.ir_freecount) { + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto alloc_inode; + } + + /* get next record to check */ + if (useleft) { + error = xfs_ialloc_next_rec(tcur, &trec, + &doneleft, 1); + } else { + error = xfs_ialloc_next_rec(cur, &rec, + &doneright, 0); + } + if (error) + goto error1; } + + /* + * We've reached the end of the btree. because + * we are only searching a small chunk of the + * btree each search, there is obviously free + * inodes closer to the parent inode than we + * are now. restart the search again. + */ + pag->pagl_pagino = NULLAGINO; + pag->pagl_leftrec = NULLAGINO; + pag->pagl_rightrec = NULLAGINO; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + goto restart_pagno; } + /* - * In a different a.g. from the parent. + * In a different AG from the parent. * See if the most recently allocated block has any free. */ - else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { - if ((error = xfs_inobt_lookup_eq(cur, - be32_to_cpu(agi->agi_newino), 0, 0, &i))) +newino: + if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { + error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), + XFS_LOOKUP_EQ, &i); + if (error) goto error0; - if (i == 1 && - (error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &j)) == 0 && - j == 1 && - rec.ir_freecount > 0) { - /* - * The last chunk allocated in the group still has - * a free inode. - */ - } - /* - * None left in the last group, search the whole a.g. - */ - else { + + if (i == 1) { + error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - ASSERT(i == 1); - for (;;) { - if ((error = xfs_inobt_get_rec(cur, - &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, - &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (rec.ir_freecount > 0) - break; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + if (j == 1 && rec.ir_freecount > 0) { + /* + * The last chunk allocated in the group + * still has a free inode. + */ + goto alloc_inode; } } } + + /* + * None left in the last group, search the whole AG + */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + for (;;) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (rec.ir_freecount > 0) + break; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + +alloc_inode: offset = xfs_ialloc_find_free(&rec.ir_free); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); @@ -945,33 +1000,19 @@ nextag: ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; - if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, - rec.ir_free))) + error = xfs_inobt_update(cur, &rec); + if (error) goto error0; be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); down_read(&mp->m_peraglock); mp->m_perag[tagno].pagi_freecount--; up_read(&mp->m_peraglock); -#ifdef DEBUG - if (cur->bc_nlevels == 1) { - int freecount = 0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - do { - if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - freecount += rec.ir_freecount; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto error0; - } while (i == 1); - ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); - } -#endif + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); *inop = ino; @@ -1062,38 +1103,23 @@ xfs_difree( * Initialize the cursor. */ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); -#ifdef DEBUG - if (cur->bc_nlevels == 1) { - int freecount = 0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - do { - if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &i))) - goto error0; - if (i) { - freecount += rec.ir_freecount; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto error0; - } - } while (i == 1); - ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); - } -#endif + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + /* * Look for the entry describing this inode. */ - if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { + if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { cmn_err(CE_WARN, - "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.", + "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.", error, mp->m_fsname); goto error0; } XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount, - &rec.ir_free, &i))) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) { cmn_err(CE_WARN, "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", error, mp->m_fsname); @@ -1148,12 +1174,14 @@ xfs_difree( } else { *delete = 0; - if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) { + error = xfs_inobt_update(cur, &rec); + if (error) { cmn_err(CE_WARN, - "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.", + "xfs_difree: xfs_inobt_update returned an error %d on %s.", error, mp->m_fsname); goto error0; } + /* * Change the inode free counts and log the ag/sb changes. */ @@ -1165,28 +1193,10 @@ xfs_difree( xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } -#ifdef DEBUG - if (cur->bc_nlevels == 1) { - int freecount = 0; + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - do { - if ((error = xfs_inobt_get_rec(cur, - &rec.ir_startino, - &rec.ir_freecount, - &rec.ir_free, &i))) - goto error0; - if (i) { - freecount += rec.ir_freecount; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto error0; - } - } while (i == 1); - ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); - } -#endif xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; @@ -1297,9 +1307,7 @@ xfs_imap( chunk_agbno = agbno - offset_agbno; } else { xfs_btree_cur_t *cur; /* inode btree cursor */ - xfs_agino_t chunk_agino; /* first agino in inode chunk */ - __int32_t chunk_cnt; /* count of free inodes in chunk */ - xfs_inofree_t chunk_free; /* mask of free inodes in chunk */ + xfs_inobt_rec_incore_t chunk_rec; xfs_buf_t *agbp; /* agi buffer */ int i; /* temp state */ @@ -1315,15 +1323,14 @@ xfs_imap( } cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); - error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (error) { xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " - "xfs_inobt_lookup_le() failed"); + "xfs_inobt_lookup() failed"); goto error0; } - error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, - &chunk_free, &i); + error = xfs_inobt_get_rec(cur, &chunk_rec, &i); if (error) { xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " "xfs_inobt_get_rec() failed"); @@ -1341,7 +1348,7 @@ xfs_imap( xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); if (error) return error; - chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); + chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino); offset_agbno = agbno - chunk_agbno; } diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index aeee8278f92c..bb5385475e1f 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -150,23 +150,15 @@ xfs_ialloc_pagi_init( xfs_agnumber_t agno); /* allocation group number */ /* - * Lookup the first record greater than or equal to ino - * in the btree given by cur. + * Lookup a record by ino in the btree given by cur. */ -int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino, - __int32_t fcnt, xfs_inofree_t free, int *stat); - -/* - * Lookup the first record less than or equal to ino - * in the btree given by cur. - */ -int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino, - __int32_t fcnt, xfs_inofree_t free, int *stat); +int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, + xfs_lookup_t dir, int *stat); /* * Get the data from the pointed-to record. */ -extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino, - __int32_t *fcnt, xfs_inofree_t *free, int *stat); +extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, int *stat); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 5fcec6f020a7..80e526489be5 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -64,6 +64,10 @@ xfs_inode_alloc( ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); if (!ip) return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); @@ -78,7 +82,6 @@ xfs_inode_alloc( memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); ip->i_flags = 0; ip->i_update_core = 0; - ip->i_update_size = 0; ip->i_delayed_blks = 0; memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); ip->i_size = 0; @@ -105,17 +108,6 @@ xfs_inode_alloc( #ifdef XFS_DIR2_TRACE ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); #endif - /* - * Now initialise the VFS inode. We do this after the xfs_inode - * initialisation as internal failures will result in ->destroy_inode - * being called and that will pass down through the reclaim path and - * free the XFS inode. This path requires the XFS inode to already be - * initialised. Hence if this call fails, the xfs_inode has already - * been freed and we should not reference it at all in the error - * handling. - */ - if (!inode_init_always(mp->m_super, VFS_I(ip))) - return NULL; /* prevent anyone from using this yet */ VFS_I(ip)->i_state = I_NEW|I_LOCK; @@ -123,6 +115,71 @@ xfs_inode_alloc( return ip; } +STATIC void +xfs_inode_free( + struct xfs_inode *ip) +{ + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + +#ifdef XFS_INODE_TRACE + ktrace_free(ip->i_trace); +#endif +#ifdef XFS_BMAP_TRACE + ktrace_free(ip->i_xtrace); +#endif +#ifdef XFS_BTREE_TRACE + ktrace_free(ip->i_btrace); +#endif +#ifdef XFS_RW_TRACE + ktrace_free(ip->i_rwtrace); +#endif +#ifdef XFS_ILOCK_TRACE + ktrace_free(ip->i_lock_trace); +#endif +#ifdef XFS_DIR2_TRACE + ktrace_free(ip->i_dir_trace); +#endif + + if (ip->i_itemp) { + /* + * Only if we are shutting down the fs will we see an + * inode still in the AIL. If it is there, we should remove + * it to prevent a use-after-free from occurring. + */ + xfs_log_item_t *lip = &ip->i_itemp->ili_item; + struct xfs_ail *ailp = lip->li_ailp; + + ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || + XFS_FORCED_SHUTDOWN(ip->i_mount)); + if (lip->li_flags & XFS_LI_IN_AIL) { + spin_lock(&ailp->xa_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, lip); + else + spin_unlock(&ailp->xa_lock); + } + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_iocount) == 0); + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(completion_done(&ip->i_flush)); + + kmem_zone_free(xfs_inode_zone, ip); +} + /* * Check the validity of the inode we just found it the cache */ @@ -133,80 +190,82 @@ xfs_iget_cache_hit( int flags, int lock_flags) __releases(pag->pag_ici_lock) { + struct inode *inode = VFS_I(ip); struct xfs_mount *mp = ip->i_mount; - int error = EAGAIN; + int error; + + spin_lock(&ip->i_flags_lock); /* - * If INEW is set this inode is being set up - * If IRECLAIM is set this inode is being torn down - * Pause and try again. + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimabe state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. */ - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) { + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; goto out_error; } - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */ - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { - - /* - * If lookup is racing with unlink, then we should return an - * error immediately so we don't remove it from the reclaim - * list and potentially leak the inode. - */ - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_error; - } + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; + } + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); /* - * We need to re-initialise the VFS inode as it has been - * 'freed' by the VFS. Do this here so we can deal with - * errors cleanly, then tag it so it can be set up correctly - * later. + * We need to set XFS_INEW atomically with clearing the + * reclaimable tag so that we do have an indicator of the + * inode still being initialized. */ - if (!inode_init_always(mp->m_super, VFS_I(ip))) { - error = ENOMEM; - goto out_error; - } + ip->i_flags |= XFS_INEW; + ip->i_flags &= ~XFS_IRECLAIMABLE; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); - /* - * We must set the XFS_INEW flag before clearing the - * XFS_IRECLAIMABLE flag so that if a racing lookup does - * not find the XFS_IRECLAIMABLE above but has the igrab() - * below succeed we can safely check XFS_INEW to detect - * that this inode is still being initialised. - */ - xfs_iflags_set(ip, XFS_INEW); - xfs_iflags_clear(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); - /* clear the radix tree reclaim flag as well. */ - __xfs_inode_clear_reclaim_tag(mp, pag, ip); - } else if (!igrab(VFS_I(ip))) { + error = -inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + read_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~XFS_INEW; + ip->i_flags |= XFS_IRECLAIMABLE; + __xfs_inode_set_reclaim_tag(pag, ip); + goto out_error; + } + inode->i_state = I_LOCK|I_NEW; + } else { /* If the VFS inode is being torn down, pause and try again. */ - XFS_STATS_INC(xs_ig_frecycle); - goto out_error; - } else if (xfs_iflags_test(ip, XFS_INEW)) { - /* - * We are racing with another cache hit that is - * currently recycling this inode out of the XFS_IRECLAIMABLE - * state. Wait for the initialisation to complete before - * continuing. - */ - wait_on_inode(VFS_I(ip)); - } + if (!igrab(inode)) { + error = EAGAIN; + goto out_error; + } - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - iput(VFS_I(ip)); - goto out_error; + /* We've got a live one. */ + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); } - /* We've got a live one. */ - read_unlock(&pag->pag_ici_lock); - if (lock_flags != 0) xfs_ilock(ip, lock_flags); @@ -216,6 +275,7 @@ xfs_iget_cache_hit( return 0; out_error: + spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); return error; } @@ -299,7 +359,8 @@ out_preload_end: if (lock_flags) xfs_iunlock(ip, lock_flags); out_destroy: - xfs_destroy_inode(ip); + __destroy_inode(VFS_I(ip)); + xfs_inode_free(ip); return error; } @@ -394,32 +455,6 @@ out_error_or_again: return error; } - -/* - * Look for the inode corresponding to the given ino in the hash table. - * If it is there and its i_transp pointer matches tp, return it. - * Otherwise, return NULL. - */ -xfs_inode_t * -xfs_inode_incore(xfs_mount_t *mp, - xfs_ino_t ino, - xfs_trans_t *tp) -{ - xfs_inode_t *ip; - xfs_perag_t *pag; - - pag = xfs_get_perag(mp, ino); - read_lock(&pag->pag_ici_lock); - ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino)); - read_unlock(&pag->pag_ici_lock); - xfs_put_perag(mp, pag); - - /* the returned inode must match the transaction */ - if (ip && (ip->i_transp != tp)) - return NULL; - return ip; -} - /* * Decrement reference count of an inode structure and unlock it. * @@ -504,62 +539,7 @@ xfs_ireclaim( xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - xfs_idestroy_fork(ip, XFS_DATA_FORK); - break; - } - - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - -#ifdef XFS_INODE_TRACE - ktrace_free(ip->i_trace); -#endif -#ifdef XFS_BMAP_TRACE - ktrace_free(ip->i_xtrace); -#endif -#ifdef XFS_BTREE_TRACE - ktrace_free(ip->i_btrace); -#endif -#ifdef XFS_RW_TRACE - ktrace_free(ip->i_rwtrace); -#endif -#ifdef XFS_ILOCK_TRACE - ktrace_free(ip->i_lock_trace); -#endif -#ifdef XFS_DIR2_TRACE - ktrace_free(ip->i_dir_trace); -#endif - if (ip->i_itemp) { - /* - * Only if we are shutting down the fs will we see an - * inode still in the AIL. If it is there, we should remove - * it to prevent a use-after-free from occurring. - */ - xfs_log_item_t *lip = &ip->i_itemp->ili_item; - struct xfs_ail *ailp = lip->li_ailp; - - ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || - XFS_FORCED_SHUTDOWN(ip->i_mount)); - if (lip->li_flags & XFS_LI_IN_AIL) { - spin_lock(&ailp->xa_lock); - if (lip->li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(ailp, lip); - else - spin_unlock(&ailp->xa_lock); - } - xfs_inode_item_destroy(ip); - ip->i_itemp = NULL; - } - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_iocount) == 0); - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(completion_done(&ip->i_flush)); - kmem_zone_free(xfs_inode_zone, ip); + xfs_inode_free(ip); } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1f22d65fed0a..c1dc7ef5a1d8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -343,6 +343,16 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } + if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && + !ip->i_mount->m_rtdev_targp)) { + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, has realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + switch (ip->i_d.di_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: @@ -641,7 +651,7 @@ xfs_iformat_btree( return 0; } -void +STATIC void xfs_dinode_from_disk( xfs_icdinode_t *to, xfs_dinode_t *from) @@ -1237,7 +1247,7 @@ xfs_isize_check( * In that case the pages will still be in memory, but the inode size * will never have been updated. */ -xfs_fsize_t +STATIC xfs_fsize_t xfs_file_last_byte( xfs_inode_t *ip) { @@ -3827,7 +3837,7 @@ xfs_iext_inline_to_direct( /* * Resize an extent indirection array to new_size bytes. */ -void +STATIC void xfs_iext_realloc_indirect( xfs_ifork_t *ifp, /* inode fork pointer */ int new_size) /* new indirection array size */ @@ -3852,7 +3862,7 @@ xfs_iext_realloc_indirect( /* * Switch from indirection array to linear (direct) extent allocations. */ -void +STATIC void xfs_iext_indirect_to_direct( xfs_ifork_t *ifp) /* inode fork pointer */ { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1804f866a71d..0b38b9a869ec 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -261,7 +261,6 @@ typedef struct xfs_inode { /* Miscellaneous state. */ unsigned short i_flags; /* see defined flags below */ unsigned char i_update_core; /* timestamps/size is dirty */ - unsigned char i_update_size; /* di_size field is dirty */ unsigned int i_delayed_blks; /* count of delay alloc blks */ xfs_icdinode_t i_d; /* most of ondisk inode */ @@ -310,23 +309,6 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) } /* - * Get rid of a partially initialized inode. - * - * We have to go through destroy_inode to make sure allocations - * from init_inode_always like the security data are undone. - * - * We mark the inode bad so that it takes the short cut in - * the reclaim path instead of going through the flush path - * which doesn't make sense for an inode that has never seen the - * light of day. - */ -static inline void xfs_destroy_inode(struct xfs_inode *ip) -{ - make_bad_inode(VFS_I(ip)); - return destroy_inode(VFS_I(ip)); -} - -/* * i_flags helper functions */ static inline void @@ -485,8 +467,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) /* * xfs_iget.c prototypes. */ -xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t, - struct xfs_trans *); int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, uint, uint, xfs_inode_t **, xfs_daddr_t); void xfs_iput(xfs_inode_t *, uint); @@ -521,7 +501,6 @@ void xfs_ipin(xfs_inode_t *); void xfs_iunpin(xfs_inode_t *); int xfs_iflush(xfs_inode_t *, uint); void xfs_ichgtime(xfs_inode_t *, int); -xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); @@ -589,8 +568,6 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, xfs_daddr_t, uint); -void xfs_dinode_from_disk(struct xfs_icdinode *, - struct xfs_dinode *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); @@ -609,8 +586,6 @@ void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int); void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); void xfs_iext_realloc_direct(xfs_ifork_t *, int); -void xfs_iext_realloc_indirect(xfs_ifork_t *, int); -void xfs_iext_indirect_to_direct(xfs_ifork_t *); void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); void xfs_iext_inline_to_direct(xfs_ifork_t *, int); void xfs_iext_destroy(xfs_ifork_t *); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 977c4aec587e..47d5b663c37e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -263,14 +263,6 @@ xfs_inode_item_format( } /* - * We don't have to worry about re-ordering here because - * the update_size field is protected by the inode lock - * and we have that held in exclusive mode. - */ - if (ip->i_update_size) - ip->i_update_size = 0; - - /* * Make sure to get the latest atime from the Linux inode. */ xfs_synchronize_atime(ip); @@ -712,8 +704,6 @@ xfs_inode_item_unlock( * Clear out the fields of the inode log item particular * to the current transaction. */ - iip->ili_ilock_recur = 0; - iip->ili_iolock_recur = 0; iip->ili_flags = 0; /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index a52ac125f055..65bae4c9b8bf 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -137,8 +137,6 @@ typedef struct xfs_inode_log_item { struct xfs_inode *ili_inode; /* inode ptr */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ - unsigned short ili_ilock_recur; /* lock recursion count */ - unsigned short ili_iolock_recur; /* lock recursion count */ unsigned short ili_flags; /* misc flags */ unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h index 7a28191cb0de..b8e4ee4e89a4 100644 --- a/fs/xfs/xfs_inum.h +++ b/fs/xfs/xfs_inum.h @@ -72,7 +72,6 @@ struct xfs_mount; #if XFS_BIG_INUMS #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) -#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32)) #else #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) #endif diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index aeb2d2221c7d..b68f9107e26c 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -39,7 +39,7 @@ #include "xfs_error.h" #include "xfs_btree.h" -int +STATIC int xfs_internal_inum( xfs_mount_t *mp, xfs_ino_t ino) @@ -353,9 +353,6 @@ xfs_bulkstat( int end_of_ag; /* set if we've seen the ag end */ int error; /* error code */ int fmterror;/* bulkstat formatter result */ - __int32_t gcnt; /* current btree rec's count */ - xfs_inofree_t gfree; /* current btree rec's free mask */ - xfs_agino_t gino; /* current btree rec's start inode */ int i; /* loop index */ int icount; /* count of inodes good in irbuf */ size_t irbsize; /* size of irec buffer in bytes */ @@ -442,40 +439,43 @@ xfs_bulkstat( * we need to get the remainder of the chunk we're in. */ if (agino > 0) { + xfs_inobt_rec_incore_t r; + /* * Lookup the inode chunk that this inode lives in. */ - error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, + &tmp); if (!error && /* no I/O error */ tmp && /* lookup succeeded */ /* got the record, should always work */ - !(error = xfs_inobt_get_rec(cur, &gino, &gcnt, - &gfree, &i)) && + !(error = xfs_inobt_get_rec(cur, &r, &i)) && i == 1 && /* this is the right chunk */ - agino < gino + XFS_INODES_PER_CHUNK && + agino < r.ir_startino + XFS_INODES_PER_CHUNK && /* lastino was not last in chunk */ - (chunkidx = agino - gino + 1) < + (chunkidx = agino - r.ir_startino + 1) < XFS_INODES_PER_CHUNK && /* there are some left allocated */ xfs_inobt_maskn(chunkidx, - XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) { + XFS_INODES_PER_CHUNK - chunkidx) & + ~r.ir_free) { /* * Grab the chunk record. Mark all the * uninteresting inodes (because they're * before our start point) free. */ for (i = 0; i < chunkidx; i++) { - if (XFS_INOBT_MASK(i) & ~gfree) - gcnt++; + if (XFS_INOBT_MASK(i) & ~r.ir_free) + r.ir_freecount++; } - gfree |= xfs_inobt_maskn(0, chunkidx); - irbp->ir_startino = gino; - irbp->ir_freecount = gcnt; - irbp->ir_free = gfree; + r.ir_free |= xfs_inobt_maskn(0, chunkidx); + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; irbp++; - agino = gino + XFS_INODES_PER_CHUNK; - icount = XFS_INODES_PER_CHUNK - gcnt; + agino = r.ir_startino + XFS_INODES_PER_CHUNK; + icount = XFS_INODES_PER_CHUNK - r.ir_freecount; } else { /* * If any of those tests failed, bump the @@ -493,7 +493,7 @@ xfs_bulkstat( /* * Start of ag. Lookup the first inode chunk. */ - error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp); + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); icount = 0; } /* @@ -501,6 +501,8 @@ xfs_bulkstat( * until we run out of inodes or space in the buffer. */ while (irbp < irbufend && icount < ubcount) { + xfs_inobt_rec_incore_t r; + /* * Loop as long as we're unable to read the * inode btree. @@ -510,51 +512,55 @@ xfs_bulkstat( if (XFS_AGINO_TO_AGBNO(mp, agino) >= be32_to_cpu(agi->agi_length)) break; - error = xfs_inobt_lookup_ge(cur, agino, 0, 0, - &tmp); + error = xfs_inobt_lookup(cur, agino, + XFS_LOOKUP_GE, &tmp); cond_resched(); } /* * If ran off the end of the ag either with an error, * or the normal way, set end and stop collecting. */ - if (error || - (error = xfs_inobt_get_rec(cur, &gino, &gcnt, - &gfree, &i)) || - i == 0) { + if (error) { end_of_ag = 1; break; } + + error = xfs_inobt_get_rec(cur, &r, &i); + if (error || i == 0) { + end_of_ag = 1; + break; + } + /* * If this chunk has any allocated inodes, save it. * Also start read-ahead now for this chunk. */ - if (gcnt < XFS_INODES_PER_CHUNK) { + if (r.ir_freecount < XFS_INODES_PER_CHUNK) { /* * Loop over all clusters in the next chunk. * Do a readahead if there are any allocated * inodes in that cluster. */ - for (agbno = XFS_AGINO_TO_AGBNO(mp, gino), - chunkidx = 0; + agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); + for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; chunkidx += nicluster, agbno += nbcluster) { - if (xfs_inobt_maskn(chunkidx, - nicluster) & ~gfree) + if (xfs_inobt_maskn(chunkidx, nicluster) + & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, agbno, nbcluster); } - irbp->ir_startino = gino; - irbp->ir_freecount = gcnt; - irbp->ir_free = gfree; + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; irbp++; - icount += XFS_INODES_PER_CHUNK - gcnt; + icount += XFS_INODES_PER_CHUNK - r.ir_freecount; } /* * Set agino to after this chunk and bump the cursor. */ - agino = gino + XFS_INODES_PER_CHUNK; + agino = r.ir_startino + XFS_INODES_PER_CHUNK; error = xfs_btree_increment(cur, 0, &tmp); cond_resched(); } @@ -820,9 +826,7 @@ xfs_inumbers( int bufidx; xfs_btree_cur_t *cur; int error; - __int32_t gcnt; - xfs_inofree_t gfree; - xfs_agino_t gino; + xfs_inobt_rec_incore_t r; int i; xfs_ino_t ino; int left; @@ -855,7 +859,8 @@ xfs_inumbers( continue; } cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); - error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, + &tmp); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); cur = NULL; @@ -870,9 +875,8 @@ xfs_inumbers( continue; } } - if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree, - &i)) || - i == 0) { + error = xfs_inobt_get_rec(cur, &r, &i); + if (error || i == 0) { xfs_buf_relse(agbp); agbp = NULL; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); @@ -881,10 +885,12 @@ xfs_inumbers( agino = 0; continue; } - agino = gino + XFS_INODES_PER_CHUNK - 1; - buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino); - buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt; - buffer[bufidx].xi_allocmask = ~gfree; + agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; + buffer[bufidx].xi_startino = + XFS_AGINO_TO_INO(mp, agno, r.ir_startino); + buffer[bufidx].xi_alloccount = + XFS_INODES_PER_CHUNK - r.ir_freecount; + buffer[bufidx].xi_allocmask = ~r.ir_free; bufidx++; left--; if (bufidx == bcount) { diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 1fb04e7deb61..20792bf45946 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -99,11 +99,6 @@ xfs_bulkstat_one( void *dibuff, int *stat); -int -xfs_internal_inum( - xfs_mount_t *mp, - xfs_ino_t ino); - typedef int (*inumbers_fmt_pf)( void __user *ubuffer, /* buffer to write to */ const xfs_inogrp_t *buffer, /* buffer to read from */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3750f04ede0b..9dbdff3ea484 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3180,7 +3180,7 @@ try_again: STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) { - ASSERT(spin_is_locked(&log->l_icloglock)); + assert_spin_locked(&log->l_icloglock); if (iclog->ic_state == XLOG_STATE_ACTIVE) { xlog_state_switch_iclogs(log, iclog, 0); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index bcad5f4c1fd1..679c7c4926a2 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -451,8 +451,6 @@ extern int xlog_find_tail(xlog_t *log, extern int xlog_recover(xlog_t *log); extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); -extern void xlog_recover_process_iunlinks(xlog_t *log); - extern struct xfs_buf *xlog_get_bp(xlog_t *, int); extern void xlog_put_bp(struct xfs_buf *); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 47da2fb45377..1099395d7d6c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink( * freeing of the inode and its removal from the list must be * atomic. */ -void +STATIC void xlog_recover_process_iunlinks( xlog_t *log) { diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 5c6f092659c1..8b6c9e807efb 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) * * The m_sb_lock must be held when this routine is called. */ -int +STATIC int xfs_mod_incore_sb_unlocked( xfs_mount_t *mp, xfs_sb_field_t field, diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a5122382afde..a6c023bc0fb2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -414,13 +414,10 @@ typedef struct xfs_mod_sb { extern int xfs_log_sbcount(xfs_mount_t *, uint); extern int xfs_mountfs(xfs_mount_t *mp); -extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); extern int xfs_unmountfs_writesb(xfs_mount_t *); extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); -extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, - int64_t, int); extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index afee7eb24323..4b0613d99faa 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -564,35 +564,6 @@ xfs_mru_cache_lookup( } /* - * To look up an element using its key, but leave its location in the internal - * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this - * function returns NULL. - * - * See the comments above the declaration of the xfs_mru_cache_lookup() function - * for important locking information pertaining to this call. - */ -void * -xfs_mru_cache_peek( - xfs_mru_cache_t *mru, - unsigned long key) -{ - xfs_mru_cache_elem_t *elem; - - ASSERT(mru && mru->lists); - if (!mru || !mru->lists) - return NULL; - - spin_lock(&mru->lock); - elem = radix_tree_lookup(&mru->store, key); - if (!elem) - spin_unlock(&mru->lock); - else - __release(mru_lock); /* help sparse not be stupid */ - - return elem ? elem->value : NULL; -} - -/* * To release the internal data structure spinlock after having performed an * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done() * with the data store pointer. diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index dd58ea1bbebe..5d439f34b0c9 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h @@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); -void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key); void xfs_mru_cache_done(struct xfs_mru_cache *mru); #endif /* __XFS_MRU_CACHE_H__ */ diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index fea68615ed23..3f816ad7ff19 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -88,90 +88,6 @@ xfs_write_clear_setuid( } /* - * Handle logging requirements of various synchronous types of write. - */ -int -xfs_write_sync_logforce( - xfs_mount_t *mp, - xfs_inode_t *ip) -{ - int error = 0; - - /* - * If we're treating this as O_DSYNC and we have not updated the - * size, force the log. - */ - if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && - !(ip->i_update_size)) { - xfs_inode_log_item_t *iip = ip->i_itemp; - - /* - * If an allocation transaction occurred - * without extending the size, then we have to force - * the log up the proper point to ensure that the - * allocation is permanent. We can't count on - * the fact that buffered writes lock out direct I/O - * writes - the direct I/O write could have extended - * the size nontransactionally, then finished before - * we started. xfs_write_file will think that the file - * didn't grow but the update isn't safe unless the - * size change is logged. - * - * Force the log if we've committed a transaction - * against the inode or if someone else has and - * the commit record hasn't gone to disk (e.g. - * the inode is pinned). This guarantees that - * all changes affecting the inode are permanent - * when we return. - */ - if (iip && iip->ili_last_lsn) { - error = _xfs_log_force(mp, iip->ili_last_lsn, - XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); - } else if (xfs_ipincount(ip) > 0) { - error = _xfs_log_force(mp, (xfs_lsn_t)0, - XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); - } - - } else { - xfs_trans_t *tp; - - /* - * O_SYNC or O_DSYNC _with_ a size update are handled - * the same way. - * - * If the write was synchronous then we need to make - * sure that the inode modification time is permanent. - * We'll have updated the timestamp above, so here - * we use a synchronous transaction to log the inode. - * It's not fast, but it's necessary. - * - * If this a dsync write and the size got changed - * non-transactionally, then we need to ensure that - * the size change gets logged in a synchronous - * transaction. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); - if ((error = xfs_trans_reserve(tp, 0, - XFS_SWRITE_LOG_RES(mp), - 0, 0, 0))) { - /* Transaction reserve failed */ - xfs_trans_cancel(tp, 0); - } else { - /* Transaction reserve successful */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - } - - return error; -} - -/* * Force a shutdown of the filesystem instantly while keeping * the filesystem consistent. We don't do an unmount here; just shutdown * the shop, make sure that absolutely nothing persistent happens to diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index f76c003ec55d..f5e4874c37d8 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -68,7 +68,6 @@ xfs_get_extsz_hint( * Prototypes for functions in xfs_rw.c. */ extern int xfs_write_clear_setuid(struct xfs_inode *ip); -extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip); extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); extern int xfs_bioerror(struct xfs_buf *bp); extern int xfs_bioerror_relse(struct xfs_buf *bp); @@ -78,10 +77,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, xfs_buf_t *bp, xfs_daddr_t blkno); -/* - * Prototypes for functions in xfs_vnodeops.c. - */ -extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, - int flags); - #endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 775249a54f6f..ed47fc77759c 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -68,7 +68,7 @@ typedef struct xfs_trans_header { #define XFS_TRANS_GROWFS 14 #define XFS_TRANS_STRAT_WRITE 15 #define XFS_TRANS_DIOSTRAT 16 -#define XFS_TRANS_WRITE_SYNC 17 +/* 17 was XFS_TRANS_WRITE_SYNC */ #define XFS_TRANS_WRITEID 18 #define XFS_TRANS_ADDAFORK 19 #define XFS_TRANS_ATTRINVAL 20 diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 8ee2f8c8b0a6..218829e6a152 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -307,7 +307,7 @@ xfs_trans_read_buf( return (flags & XFS_BUF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); - if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) { + if (XFS_BUF_GETERROR(bp) != 0) { xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); error = XFS_BUF_GETERROR(bp); @@ -315,7 +315,7 @@ xfs_trans_read_buf( return error; } #ifdef DEBUG - if (xfs_do_error && (bp != NULL)) { + if (xfs_do_error) { if (xfs_error_target == target) { if (((xfs_req_num++) % xfs_error_mod) == 0) { xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 23d276af2e0c..785ff101da0a 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -49,30 +49,7 @@ xfs_trans_inode_broot_debug( /* - * Get and lock the inode for the caller if it is not already - * locked within the given transaction. If it is already locked - * within the transaction, just increment its lock recursion count - * and return a pointer to it. - * - * For an inode to be locked in a transaction, the inode lock, as - * opposed to the io lock, must be taken exclusively. This ensures - * that the inode can be involved in only 1 transaction at a time. - * Lock recursion is handled on the io lock, but only for lock modes - * of equal or lesser strength. That is, you can recur on the io lock - * held EXCL with a SHARED request but not vice versa. Also, if - * the inode is already a part of the transaction then you cannot - * go from not holding the io lock to having it EXCL or SHARED. - * - * Use the inode cache routine xfs_inode_incore() to find the inode - * if it is already owned by this transaction. - * - * If we don't already own the inode, use xfs_iget() to get it. - * Since the inode log item structure is embedded in the incore - * inode structure and is initialized when the inode is brought - * into memory, there is nothing to do with it here. - * - * If the given transaction pointer is NULL, just call xfs_iget(). - * This simplifies code which must handle both cases. + * Get an inode and join it to the transaction. */ int xfs_trans_iget( @@ -84,62 +61,11 @@ xfs_trans_iget( xfs_inode_t **ipp) { int error; - xfs_inode_t *ip; - - /* - * If the transaction pointer is NULL, just call the normal - * xfs_iget(). - */ - if (tp == NULL) - return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0); - - /* - * If we find the inode in core with this transaction - * pointer in its i_transp field, then we know we already - * have it locked. In this case we just increment the lock - * recursion count and return the inode to the caller. - * Assert that the inode is already locked in the mode requested - * by the caller. We cannot do lock promotions yet, so - * die if someone gets this wrong. - */ - if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) { - /* - * Make sure that the inode lock is held EXCL and - * that the io lock is never upgraded when the inode - * is already a part of the transaction. - */ - ASSERT(ip->i_itemp != NULL); - ASSERT(lock_flags & XFS_ILOCK_EXCL); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || - xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || - (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL)); - ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || - xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); - ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || - (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY)); - - if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) { - ip->i_itemp->ili_iolock_recur++; - } - if (lock_flags & XFS_ILOCK_EXCL) { - ip->i_itemp->ili_ilock_recur++; - } - *ipp = ip; - return 0; - } - - ASSERT(lock_flags & XFS_ILOCK_EXCL); - error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0); - if (error) { - return error; - } - ASSERT(ip != NULL); - xfs_trans_ijoin(tp, ip, lock_flags); - *ipp = ip; - return 0; + error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0); + if (!error && tp) + xfs_trans_ijoin(tp, *ipp, lock_flags); + return error; } /* @@ -163,8 +89,6 @@ xfs_trans_ijoin( xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; ASSERT(iip->ili_flags == 0); - ASSERT(iip->ili_ilock_recur == 0); - ASSERT(iip->ili_iolock_recur == 0); /* * Get a log_item_desc to point at the new item. diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c4eca5ed5dab..a434f287962d 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -538,7 +538,9 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); + bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt), + XBF_LOCK | XBF_MAPPED | + XBF_DONT_BLOCK); error = XFS_BUF_GETERROR(bp); if (error) { xfs_ioerror_alert("xfs_readlink", @@ -609,7 +611,7 @@ xfs_fsync( xfs_inode_t *ip) { xfs_trans_t *tp; - int error; + int error = 0; int log_flushed = 0, changed = 1; xfs_itrace_entry(ip); @@ -617,14 +619,9 @@ xfs_fsync( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return XFS_ERROR(EIO); - /* capture size updates in I/O completion before writing the inode. */ - error = xfs_wait_on_pages(ip, 0, -1); - if (error) - return XFS_ERROR(error); - /* * We always need to make sure that the required inode state is safe on - * disk. The vnode might be clean but we still might need to force the + * disk. The inode might be clean but we still might need to force the * log because of committed transactions that haven't hit the disk yet. * Likewise, there could be unflushed non-transactional changes to the * inode core that have to go to disk and this requires us to issue @@ -636,7 +633,7 @@ xfs_fsync( */ xfs_ilock(ip, XFS_ILOCK_SHARED); - if (!(ip->i_update_size || ip->i_update_core)) { + if (!ip->i_update_core) { /* * Timestamps/size haven't changed since last inode flush or * inode transaction commit. That means either nothing got @@ -716,7 +713,7 @@ xfs_fsync( * when the link count isn't zero and by xfs_dm_punch_hole() when * punching a hole to EOF. */ -int +STATIC int xfs_free_eofblocks( xfs_mount_t *mp, xfs_inode_t *ip, @@ -1474,8 +1471,8 @@ xfs_create( if (error == ENOSPC) { /* flush outstanding delalloc blocks and retry */ xfs_flush_inodes(dp); - error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); + error = xfs_trans_reserve(tp, resblks, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); } if (error == ENOSPC) { /* No space at all so try a "no-allocation" reservation */ |