diff options
Diffstat (limited to 'fs')
392 files changed, 12787 insertions, 9709 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 0ad61c6a65a5..055562c580b4 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -33,6 +33,7 @@ #include <linux/pagemap.h> #include <linux/idr.h> #include <linux/sched.h> +#include <linux/aio.h> #include <net/9p/9p.h> #include <net/9p/client.h> diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 91dad63e5a2d..2756dcd5de6e 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -365,3 +365,4 @@ struct file_system_type v9fs_fs_type = { .owner = THIS_MODULE, .fs_flags = FS_RENAME_DOES_D_MOVE, }; +MODULE_ALIAS_FS("9p"); diff --git a/fs/Kconfig b/fs/Kconfig index 780725a463b1..c229f828eb01 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -211,6 +211,7 @@ source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" source "fs/f2fs/Kconfig" +source "fs/efivarfs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 0efd1524b977..370b24cee4d8 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -65,6 +65,20 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS This config option changes the default setting of coredump_filter seen at boot time. If unsure, say Y. +config BINFMT_SCRIPT + tristate "Kernel support for scripts starting with #!" + default y + help + Say Y here if you want to execute interpreted scripts starting with + #! followed by the path to an interpreter. + + You can build this support as a module; however, until that module + gets loaded, you cannot run scripts. Thus, if you want to load this + module from an initramfs, the portion of the initramfs before loading + this module must consist of compiled binaries only. + + Most systems will not boot if you say M or N here. If unsure, say Y. 
+ config BINFMT_FLAT bool "Kernel support for flat binaries" depends on !MMU && (!FRV || BROKEN) diff --git a/fs/Makefile b/fs/Makefile index 9d53192236fc..4fe6df3ec28f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -7,10 +7,10 @@ obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ - ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ + ioctl.o readdir.o select.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ - pnode.o drop_caches.o splice.o sync.o utimes.o \ + pnode.o splice.o sync.o utimes.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) @@ -34,10 +34,7 @@ obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o - -# binfmt_script is always there -obj-y += binfmt_script.o - +obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o @@ -49,6 +46,7 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_COREDUMP) += coredump.o +obj-$(CONFIG_SYSCTL) += drop_caches.o obj-$(CONFIG_FHANDLE) += fhandle.o @@ -127,3 +125,4 @@ obj-$(CONFIG_F2FS_FS) += f2fs/ obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ +obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/adfs/super.c b/fs/adfs/super.c index d57122935793..0ff4bae2c2a2 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -524,6 +524,7 @@ static struct file_system_type adfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("adfs"); static int __init init_adfs_fs(void) { diff --git a/fs/affs/super.c b/fs/affs/super.c index b84dc7352502..45161a832bbc 100644 --- 
a/fs/affs/super.c +++ b/fs/affs/super.c @@ -622,6 +622,7 @@ static struct file_system_type affs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("affs"); static int __init init_affs_fs(void) { diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 096b23f821a1..526e4bbbde59 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -190,7 +190,7 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file) return ret; m = file->private_data; - m->private = PDE(inode)->data; + m->private = PDE_DATA(inode); return 0; } @@ -448,7 +448,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; - cell = PDE(inode)->data; + cell = PDE_DATA(inode); if (!cell) return -ENOENT; @@ -554,7 +554,7 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; - cell = PDE(inode)->data; + cell = PDE_DATA(inode); if (!cell) return -ENOENT; @@ -659,7 +659,7 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; - cell = PDE(inode)->data; + cell = PDE_DATA(inode); if (!cell) return -ENOENT; diff --git a/fs/afs/super.c b/fs/afs/super.c index 7c31ec399575..c4861557e385 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -45,6 +45,7 @@ struct file_system_type afs_fs_type = { .kill_sb = afs_kill_super, .fs_flags = 0, }; +MODULE_ALIAS_FS("afs"); static const struct super_operations afs_super_ops = { .statfs = afs_statfs, diff --git a/fs/afs/write.c b/fs/afs/write.c index 7e03eadb40c0..a890db4b9898 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/pagevec.h> +#include <linux/aio.h> #include "internal.h" static int afs_write_back_from_locked_page(struct afs_writeback *wb, @@ -8,6 +8,8 @@ * * See ../COPYING for licensing terms. 
*/ +#define pr_fmt(fmt) "%s: " fmt, __func__ + #include <linux/kernel.h> #include <linux/init.h> #include <linux/errno.h> @@ -18,8 +20,6 @@ #include <linux/backing-dev.h> #include <linux/uio.h> -#define DEBUG 0 - #include <linux/sched.h> #include <linux/fs.h> #include <linux/file.h> @@ -39,11 +39,76 @@ #include <asm/kmap_types.h> #include <asm/uaccess.h> -#if DEBUG > 1 -#define dprintk printk -#else -#define dprintk(x...) do { ; } while (0) -#endif +#define AIO_RING_MAGIC 0xa10a10a1 +#define AIO_RING_COMPAT_FEATURES 1 +#define AIO_RING_INCOMPAT_FEATURES 0 +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; + unsigned tail; + + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ + + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +#define AIO_RING_PAGES 8 + +struct kioctx { + atomic_t users; + atomic_t dead; + + /* This needs improving */ + unsigned long user_id; + struct hlist_node list; + + /* + * This is what userspace passed to io_setup(), it's not used for + * anything but counting against the global max_reqs quota. 
+ * + * The real limit is nr_events - 1, which will be larger (see + * aio_setup_ring()) + */ + unsigned max_reqs; + + /* Size of ringbuffer, in units of struct io_event */ + unsigned nr_events; + + unsigned long mmap_base; + unsigned long mmap_size; + + struct page **ring_pages; + long nr_pages; + + struct rcu_head rcu_head; + struct work_struct rcu_work; + + struct { + atomic_t reqs_active; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t ctx_lock; + struct list_head active_reqs; /* used for cancellation */ + } ____cacheline_aligned_in_smp; + + struct { + struct mutex ring_lock; + wait_queue_head_t wait; + } ____cacheline_aligned_in_smp; + + struct { + unsigned tail; + spinlock_t completion_lock; + } ____cacheline_aligned_in_smp; + + struct page *internal_pages[AIO_RING_PAGES]; +}; /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); @@ -54,11 +119,6 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; -static struct workqueue_struct *aio_wq; - -static void aio_kick_handler(struct work_struct *); -static void aio_queue_work(struct kioctx *); - /* aio_setup * Creates the slab caches used by the aio routines, panic on * failure as this is done early during the boot sequence. 
@@ -68,10 +128,7 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); - aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */ - BUG_ON(!aio_wq); - - pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); return 0; } @@ -79,28 +136,23 @@ __initcall(aio_setup); static void aio_free_ring(struct kioctx *ctx) { - struct aio_ring_info *info = &ctx->ring_info; long i; - for (i=0; i<info->nr_pages; i++) - put_page(info->ring_pages[i]); + for (i = 0; i < ctx->nr_pages; i++) + put_page(ctx->ring_pages[i]); - if (info->mmap_size) { - BUG_ON(ctx->mm != current->mm); - vm_munmap(info->mmap_base, info->mmap_size); - } + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); - if (info->ring_pages && info->ring_pages != info->internal_pages) - kfree(info->ring_pages); - info->ring_pages = NULL; - info->nr = 0; + if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) + kfree(ctx->ring_pages); } static int aio_setup_ring(struct kioctx *ctx) { struct aio_ring *ring; - struct aio_ring_info *info = &ctx->ring_info; unsigned nr_events = ctx->max_reqs; + struct mm_struct *mm = current->mm; unsigned long size, populate; int nr_pages; @@ -116,46 +168,44 @@ static int aio_setup_ring(struct kioctx *ctx) nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info->nr = 0; - info->ring_pages = info->internal_pages; + ctx->nr_events = 0; + ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { - info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!info->ring_pages) + ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!ctx->ring_pages) return -ENOMEM; } - info->mmap_size = nr_pages * PAGE_SIZE; - dprintk("attempting mmap of %lu bytes\n", info->mmap_size); - 
down_write(&ctx->mm->mmap_sem); - info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0, - &populate); - if (IS_ERR((void *)info->mmap_base)) { - up_write(&ctx->mm->mmap_sem); - info->mmap_size = 0; + ctx->mmap_size = nr_pages * PAGE_SIZE; + pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); + down_write(&mm->mmap_sem); + ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, + PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); + if (IS_ERR((void *)ctx->mmap_base)) { + up_write(&mm->mmap_sem); + ctx->mmap_size = 0; aio_free_ring(ctx); return -EAGAIN; } - dprintk("mmap address: 0x%08lx\n", info->mmap_base); - info->nr_pages = get_user_pages(current, ctx->mm, - info->mmap_base, nr_pages, - 1, 0, info->ring_pages, NULL); - up_write(&ctx->mm->mmap_sem); + pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); + ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, + 1, 0, ctx->ring_pages, NULL); + up_write(&mm->mmap_sem); - if (unlikely(info->nr_pages != nr_pages)) { + if (unlikely(ctx->nr_pages != nr_pages)) { aio_free_ring(ctx); return -EAGAIN; } if (populate) - mm_populate(info->mmap_base, populate); + mm_populate(ctx->mmap_base, populate); - ctx->user_id = info->mmap_base; + ctx->user_id = ctx->mmap_base; + ctx->nr_events = nr_events; /* trusted copy */ - info->nr = nr_events; /* trusted copy */ - - ring = kmap_atomic(info->ring_pages[0]); + ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ ring->id = ctx->user_id; ring->head = ring->tail = 0; @@ -164,72 +214,133 @@ static int aio_setup_ring(struct kioctx *ctx) ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); return 0; } - -/* aio_ring_event: returns a pointer to the event at the given index from - * kmap_atomic(). 
Release the pointer with put_aio_ring_event(); - */ #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ - struct io_event *__event; \ - __event = kmap_atomic( \ - (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \ - __event += pos % AIO_EVENTS_PER_PAGE; \ - __event; \ -}) - -#define put_aio_ring_event(event) do { \ - struct io_event *__event = (event); \ - (void)__event; \ - kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \ -} while(0) - -static void ctx_rcu_free(struct rcu_head *head) +void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) +{ + struct kioctx *ctx = req->ki_ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + + if (!req->ki_list.next) + list_add(&req->ki_list, &ctx->active_reqs); + + req->ki_cancel = cancel; + + spin_unlock_irqrestore(&ctx->ctx_lock, flags); +} +EXPORT_SYMBOL(kiocb_set_cancel_fn); + +static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, + struct io_event *res) +{ + kiocb_cancel_fn *old, *cancel; + int ret = -EINVAL; + + /* + * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it + * actually has a cancel function, hence the cmpxchg() + */ + + cancel = ACCESS_ONCE(kiocb->ki_cancel); + do { + if (!cancel || cancel == KIOCB_CANCELLED) + return ret; + + old = cancel; + cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); + } while (cancel != old); + + atomic_inc(&kiocb->ki_users); + spin_unlock_irq(&ctx->ctx_lock); + + memset(res, 0, sizeof(*res)); + res->obj = (u64)(unsigned long)kiocb->ki_obj.user; + res->data = kiocb->ki_user_data; + ret = cancel(kiocb, res); + + spin_lock_irq(&ctx->ctx_lock); + + return ret; +} + +static void free_ioctx_rcu(struct rcu_head *head) { struct kioctx *ctx = 
container_of(head, struct kioctx, rcu_head); kmem_cache_free(kioctx_cachep, ctx); } -/* __put_ioctx - * Called when the last user of an aio context has gone away, - * and the struct needs to be freed. +/* + * When this function runs, the kioctx has been removed from the "hash table" + * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - + * now it's safe to cancel any that need to be. */ -static void __put_ioctx(struct kioctx *ctx) +static void free_ioctx(struct kioctx *ctx) { - unsigned nr_events = ctx->max_reqs; - BUG_ON(ctx->reqs_active); + struct aio_ring *ring; + struct io_event res; + struct kiocb *req; + unsigned head, avail; - cancel_delayed_work_sync(&ctx->wq); - aio_free_ring(ctx); - mmdrop(ctx->mm); - ctx->mm = NULL; - if (nr_events) { - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - nr_events > aio_nr); - aio_nr -= nr_events; - spin_unlock(&aio_nr_lock); + spin_lock_irq(&ctx->ctx_lock); + + while (!list_empty(&ctx->active_reqs)) { + req = list_first_entry(&ctx->active_reqs, + struct kiocb, ki_list); + + list_del_init(&req->ki_list); + kiocb_cancel(ctx, req, &res); } - pr_debug("__put_ioctx: freeing %p\n", ctx); - call_rcu(&ctx->rcu_head, ctx_rcu_free); -} -static inline int try_get_ioctx(struct kioctx *kioctx) -{ - return atomic_inc_not_zero(&kioctx->users); + spin_unlock_irq(&ctx->ctx_lock); + + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; + kunmap_atomic(ring); + + while (atomic_read(&ctx->reqs_active) > 0) { + wait_event(ctx->wait, head != ctx->tail); + + avail = (head <= ctx->tail ? 
ctx->tail : ctx->nr_events) - head; + + atomic_sub(avail, &ctx->reqs_active); + head += avail; + head %= ctx->nr_events; + } + + WARN_ON(atomic_read(&ctx->reqs_active) < 0); + + aio_free_ring(ctx); + + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - ctx->max_reqs > aio_nr); + aio_nr -= ctx->max_reqs; + spin_unlock(&aio_nr_lock); + + pr_debug("freeing %p\n", ctx); + + /* + * Here the call_rcu() is between the wait_event() for reqs_active to + * hit 0, and freeing the ioctx. + * + * aio_complete() decrements reqs_active, but it has to touch the ioctx + * after to issue a wakeup so we use rcu. + */ + call_rcu(&ctx->rcu_head, free_ioctx_rcu); } -static inline void put_ioctx(struct kioctx *kioctx) +static void put_ioctx(struct kioctx *ctx) { - BUG_ON(atomic_read(&kioctx->users) <= 0); - if (unlikely(atomic_dec_and_test(&kioctx->users))) - __put_ioctx(kioctx); + if (unlikely(atomic_dec_and_test(&ctx->users))) + free_ioctx(ctx); } /* ioctx_alloc @@ -237,7 +348,7 @@ static inline void put_ioctx(struct kioctx *kioctx) */ static struct kioctx *ioctx_alloc(unsigned nr_events) { - struct mm_struct *mm; + struct mm_struct *mm = current->mm; struct kioctx *ctx; int err = -ENOMEM; @@ -256,17 +367,15 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-ENOMEM); ctx->max_reqs = nr_events; - mm = ctx->mm = current->mm; - atomic_inc(&mm->mm_count); atomic_set(&ctx->users, 2); + atomic_set(&ctx->dead, 0); spin_lock_init(&ctx->ctx_lock); - spin_lock_init(&ctx->ring_info.ring_lock); + spin_lock_init(&ctx->completion_lock); + mutex_init(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); - INIT_LIST_HEAD(&ctx->run_list); - INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); if (aio_setup_ring(ctx) < 0) goto out_freectx; @@ -286,64 +395,56 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); spin_unlock(&mm->ioctx_lock); - dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", - ctx, 
ctx->user_id, current->mm, ctx->ring_info.nr); + pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + ctx, ctx->user_id, mm, ctx->nr_events); return ctx; out_cleanup: err = -EAGAIN; aio_free_ring(ctx); out_freectx: - mmdrop(mm); kmem_cache_free(kioctx_cachep, ctx); - dprintk("aio: error allocating ioctx %d\n", err); + pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); } -/* kill_ctx - * Cancels all outstanding aio requests on an aio context. Used - * when the processes owning a context have all exited to encourage - * the rapid destruction of the kioctx. - */ -static void kill_ctx(struct kioctx *ctx) +static void kill_ioctx_work(struct work_struct *work) { - int (*cancel)(struct kiocb *, struct io_event *); - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - struct io_event res; + struct kioctx *ctx = container_of(work, struct kioctx, rcu_work); - spin_lock_irq(&ctx->ctx_lock); - ctx->dead = 1; - while (!list_empty(&ctx->active_reqs)) { - struct list_head *pos = ctx->active_reqs.next; - struct kiocb *iocb = list_kiocb(pos); - list_del_init(&iocb->ki_list); - cancel = iocb->ki_cancel; - kiocbSetCancelled(iocb); - if (cancel) { - iocb->ki_users++; - spin_unlock_irq(&ctx->ctx_lock); - cancel(iocb, &res); - spin_lock_irq(&ctx->ctx_lock); - } - } + wake_up_all(&ctx->wait); + put_ioctx(ctx); +} - if (!ctx->reqs_active) - goto out; +static void kill_ioctx_rcu(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); - add_wait_queue(&ctx->wait, &wait); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - while (ctx->reqs_active) { - spin_unlock_irq(&ctx->ctx_lock); - io_schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - spin_lock_irq(&ctx->ctx_lock); - } - __set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&ctx->wait, &wait); + INIT_WORK(&ctx->rcu_work, kill_ioctx_work); + schedule_work(&ctx->rcu_work); +} -out: - spin_unlock_irq(&ctx->ctx_lock); +/* kill_ioctx + * Cancels all outstanding 
aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. + */ +static void kill_ioctx(struct kioctx *ctx) +{ + if (!atomic_xchg(&ctx->dead, 1)) { + hlist_del_rcu(&ctx->list); + /* Between hlist_del_rcu() and dropping the initial ref */ + synchronize_rcu(); + + /* + * We can't punt to workqueue here because put_ioctx() -> + * free_ioctx() will unmap the ringbuffer, and that has to be + * done in the original process's context. kill_ioctx_rcu/work() + * exist for exit_aio(), as in that path free_ioctx() won't do + * the unmap. + */ + kill_ioctx_work(&ctx->rcu_work); + } } /* wait_on_sync_kiocb: @@ -351,9 +452,9 @@ out: */ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { - while (iocb->ki_users) { + while (atomic_read(&iocb->ki_users)) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!iocb->ki_users) + if (!atomic_read(&iocb->ki_users)) break; io_schedule(); } @@ -362,28 +463,26 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) } EXPORT_SYMBOL(wait_on_sync_kiocb); -/* exit_aio: called when the last user of mm goes away. At this point, - * there is no way for any new requests to be submited or any of the - * io_* syscalls to be called on the context. However, there may be - * outstanding requests which hold references to the context; as they - * go away, they will call put_ioctx and release any pinned memory - * associated with the request (held via struct page * references). +/* + * exit_aio: called when the last user of mm goes away. At this point, there is + * no way for any new requests to be submited or any of the io_* syscalls to be + * called on the context. + * + * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on + * them. 
*/ void exit_aio(struct mm_struct *mm) { struct kioctx *ctx; + struct hlist_node *n; - while (!hlist_empty(&mm->ioctx_list)) { - ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); - hlist_del_rcu(&ctx->list); - - kill_ctx(ctx); - + hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { if (1 != atomic_read(&ctx->users)) printk(KERN_DEBUG "exit_aio:ioctx still alive: %d %d %d\n", - atomic_read(&ctx->users), ctx->dead, - ctx->reqs_active); + atomic_read(&ctx->users), + atomic_read(&ctx->dead), + atomic_read(&ctx->reqs_active)); /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -391,150 +490,53 @@ void exit_aio(struct mm_struct *mm) * as indicator that it needs to unmap the area, * just set it to 0; aio_free_ring() is the only * place that uses ->mmap_size, so it's safe. - * That way we get all munmap done to current->mm - - * all other callers have ctx->mm == current->mm. */ - ctx->ring_info.mmap_size = 0; - put_ioctx(ctx); + ctx->mmap_size = 0; + + if (!atomic_xchg(&ctx->dead, 1)) { + hlist_del_rcu(&ctx->list); + call_rcu(&ctx->rcu_head, kill_ioctx_rcu); + } } } /* aio_get_req - * Allocate a slot for an aio request. Increments the users count + * Allocate a slot for an aio request. Increments the ki_users count * of the kioctx so that the kioctx stays around until all requests are * complete. Returns NULL if no requests are free. * - * Returns with kiocb->users set to 2. The io submit code path holds + * Returns with kiocb->ki_users set to 2. The io submit code path holds * an extra reference while submitting the i/o. * This prevents races between the aio code path referencing the * req (after submitting it) and aio_complete() freeing the req. 
*/ -static struct kiocb *__aio_get_req(struct kioctx *ctx) +static inline struct kiocb *aio_get_req(struct kioctx *ctx) { - struct kiocb *req = NULL; + struct kiocb *req; - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); - if (unlikely(!req)) + if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) return NULL; - req->ki_flags = 0; - req->ki_users = 2; - req->ki_key = 0; - req->ki_ctx = ctx; - req->ki_cancel = NULL; - req->ki_retry = NULL; - req->ki_dtor = NULL; - req->private = NULL; - req->ki_iovec = NULL; - INIT_LIST_HEAD(&req->ki_run_list); - req->ki_eventfd = NULL; - - return req; -} - -/* - * struct kiocb's are allocated in batches to reduce the number of - * times the ctx lock is acquired and released. - */ -#define KIOCB_BATCH_SIZE 32L -struct kiocb_batch { - struct list_head head; - long count; /* number of requests left to allocate */ -}; - -static void kiocb_batch_init(struct kiocb_batch *batch, long total) -{ - INIT_LIST_HEAD(&batch->head); - batch->count = total; -} - -static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) -{ - struct kiocb *req, *n; - - if (list_empty(&batch->head)) - return; - - spin_lock_irq(&ctx->ctx_lock); - list_for_each_entry_safe(req, n, &batch->head, ki_batch) { - list_del(&req->ki_batch); - list_del(&req->ki_list); - kmem_cache_free(kiocb_cachep, req); - ctx->reqs_active--; - } - if (unlikely(!ctx->reqs_active && ctx->dead)) - wake_up_all(&ctx->wait); - spin_unlock_irq(&ctx->ctx_lock); -} - -/* - * Allocate a batch of kiocbs. This avoids taking and dropping the - * context lock a lot during setup. 
- */ -static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) -{ - unsigned short allocated, to_alloc; - long avail; - struct kiocb *req, *n; - struct aio_ring *ring; - - to_alloc = min(batch->count, KIOCB_BATCH_SIZE); - for (allocated = 0; allocated < to_alloc; allocated++) { - req = __aio_get_req(ctx); - if (!req) - /* allocation failed, go with what we've got */ - break; - list_add(&req->ki_batch, &batch->head); - } - - if (allocated == 0) - goto out; - - spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0]); - - avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; - BUG_ON(avail < 0); - if (avail < allocated) { - /* Trim back the number of requests. */ - list_for_each_entry_safe(req, n, &batch->head, ki_batch) { - list_del(&req->ki_batch); - kmem_cache_free(kiocb_cachep, req); - if (--allocated <= avail) - break; - } - } - - batch->count -= allocated; - list_for_each_entry(req, &batch->head, ki_batch) { - list_add(&req->ki_list, &ctx->active_reqs); - ctx->reqs_active++; - } - - kunmap_atomic(ring); - spin_unlock_irq(&ctx->ctx_lock); + if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) + goto out_put; -out: - return allocated; -} + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); + if (unlikely(!req)) + goto out_put; -static inline struct kiocb *aio_get_req(struct kioctx *ctx, - struct kiocb_batch *batch) -{ - struct kiocb *req; + atomic_set(&req->ki_users, 2); + req->ki_ctx = ctx; - if (list_empty(&batch->head)) - if (kiocb_batch_refill(ctx, batch) == 0) - return NULL; - req = list_first_entry(&batch->head, struct kiocb, ki_batch); - list_del(&req->ki_batch); return req; +out_put: + atomic_dec(&ctx->reqs_active); + return NULL; } -static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) +static void kiocb_free(struct kiocb *req) { - assert_spin_locked(&ctx->ctx_lock); - + if (req->ki_filp) + fput(req->ki_filp); if (req->ki_eventfd != NULL) 
eventfd_ctx_put(req->ki_eventfd); if (req->ki_dtor) @@ -542,48 +544,12 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) if (req->ki_iovec != &req->ki_inline_vec) kfree(req->ki_iovec); kmem_cache_free(kiocb_cachep, req); - ctx->reqs_active--; - - if (unlikely(!ctx->reqs_active && ctx->dead)) - wake_up_all(&ctx->wait); } -/* __aio_put_req - * Returns true if this put was the last user of the request. - */ -static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) +void aio_put_req(struct kiocb *req) { - dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", - req, atomic_long_read(&req->ki_filp->f_count)); - - assert_spin_locked(&ctx->ctx_lock); - - req->ki_users--; - BUG_ON(req->ki_users < 0); - if (likely(req->ki_users)) - return 0; - list_del(&req->ki_list); /* remove from active_reqs */ - req->ki_cancel = NULL; - req->ki_retry = NULL; - - fput(req->ki_filp); - req->ki_filp = NULL; - really_put_req(ctx, req); - return 1; -} - -/* aio_put_req - * Returns true if this put was the last user of the kiocb, - * false if the request is still in use. - */ -int aio_put_req(struct kiocb *req) -{ - struct kioctx *ctx = req->ki_ctx; - int ret; - spin_lock_irq(&ctx->ctx_lock); - ret = __aio_put_req(ctx, req); - spin_unlock_irq(&ctx->ctx_lock); - return ret; + if (atomic_dec_and_test(&req->ki_users)) + kiocb_free(req); } EXPORT_SYMBOL(aio_put_req); @@ -595,13 +561,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) rcu_read_lock(); hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { - /* - * RCU protects us against accessing freed memory but - * we have to be careful not to get a reference when the - * reference count already dropped to 0 (ctx->dead test - * is unreliable because of races). 
- */ - if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ + if (ctx->user_id == ctx_id) { + atomic_inc(&ctx->users); ret = ctx; break; } @@ -611,295 +572,16 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) return ret; } -/* - * Queue up a kiocb to be retried. Assumes that the kiocb - * has already been marked as kicked, and places it on - * the retry run list for the corresponding ioctx, if it - * isn't already queued. Returns 1 if it actually queued - * the kiocb (to tell the caller to activate the work - * queue to process it), or 0, if it found that it was - * already queued. - */ -static inline int __queue_kicked_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - - assert_spin_locked(&ctx->ctx_lock); - - if (list_empty(&iocb->ki_run_list)) { - list_add_tail(&iocb->ki_run_list, - &ctx->run_list); - return 1; - } - return 0; -} - -/* aio_run_iocb - * This is the core aio execution routine. It is - * invoked both for initial i/o submission and - * subsequent retries via the aio_kick_handler. - * Expects to be invoked with iocb->ki_ctx->lock - * already held. The lock is released and reacquired - * as needed during processing. - * - * Calls the iocb retry method (already setup for the - * iocb on initial submission) for operation specific - * handling, but takes care of most of common retry - * execution details for a given iocb. The retry method - * needs to be non-blocking as far as possible, to avoid - * holding up other iocbs waiting to be serviced by the - * retry kernel thread. - * - * The trickier parts in this code have to do with - * ensuring that only one retry instance is in progress - * for a given iocb at any time. Providing that guarantee - * simplifies the coding of individual aio operations as - * it avoids various potential races. 
- */ -static ssize_t aio_run_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - ssize_t (*retry)(struct kiocb *); - ssize_t ret; - - if (!(retry = iocb->ki_retry)) { - printk("aio_run_iocb: iocb->ki_retry = NULL\n"); - return 0; - } - - /* - * We don't want the next retry iteration for this - * operation to start until this one has returned and - * updated the iocb state. However, wait_queue functions - * can trigger a kick_iocb from interrupt context in the - * meantime, indicating that data is available for the next - * iteration. We want to remember that and enable the - * next retry iteration _after_ we are through with - * this one. - * - * So, in order to be able to register a "kick", but - * prevent it from being queued now, we clear the kick - * flag, but make the kick code *think* that the iocb is - * still on the run list until we are actually done. - * When we are done with this iteration, we check if - * the iocb was kicked in the meantime and if so, queue - * it up afresh. - */ - - kiocbClearKicked(iocb); - - /* - * This is so that aio_complete knows it doesn't need to - * pull the iocb off the run list (We can't just call - * INIT_LIST_HEAD because we don't want a kick_iocb to - * queue this on the run list yet) - */ - iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; - spin_unlock_irq(&ctx->ctx_lock); - - /* Quit retrying if the i/o has been cancelled */ - if (kiocbIsCancelled(iocb)) { - ret = -EINTR; - aio_complete(iocb, ret, 0); - /* must not access the iocb after this */ - goto out; - } - - /* - * Now we are all set to call the retry method in async - * context. - */ - ret = retry(iocb); - - if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { - /* - * There's no easy way to restart the syscall since other AIO's - * may be already running. Just fail this IO with EINTR. 
- */ - if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || - ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) - ret = -EINTR; - aio_complete(iocb, ret, 0); - } -out: - spin_lock_irq(&ctx->ctx_lock); - - if (-EIOCBRETRY == ret) { - /* - * OK, now that we are done with this iteration - * and know that there is more left to go, - * this is where we let go so that a subsequent - * "kick" can start the next iteration - */ - - /* will make __queue_kicked_iocb succeed from here on */ - INIT_LIST_HEAD(&iocb->ki_run_list); - /* we must queue the next iteration ourselves, if it - * has already been kicked */ - if (kiocbIsKicked(iocb)) { - __queue_kicked_iocb(iocb); - - /* - * __queue_kicked_iocb will always return 1 here, because - * iocb->ki_run_list is empty at this point so it should - * be safe to unconditionally queue the context into the - * work queue. - */ - aio_queue_work(ctx); - } - } - return ret; -} - -/* - * __aio_run_iocbs: - * Process all pending retries queued on the ioctx - * run list. - * Assumes it is operating within the aio issuer's mm - * context. - */ -static int __aio_run_iocbs(struct kioctx *ctx) -{ - struct kiocb *iocb; - struct list_head run_list; - - assert_spin_locked(&ctx->ctx_lock); - - list_replace_init(&ctx->run_list, &run_list); - while (!list_empty(&run_list)) { - iocb = list_entry(run_list.next, struct kiocb, - ki_run_list); - list_del(&iocb->ki_run_list); - /* - * Hold an extra reference while retrying i/o. 
- */ - iocb->ki_users++; /* grab extra reference */ - aio_run_iocb(iocb); - __aio_put_req(ctx, iocb); - } - if (!list_empty(&ctx->run_list)) - return 1; - return 0; -} - -static void aio_queue_work(struct kioctx * ctx) -{ - unsigned long timeout; - /* - * if someone is waiting, get the work started right - * away, otherwise, use a longer delay - */ - smp_mb(); - if (waitqueue_active(&ctx->wait)) - timeout = 1; - else - timeout = HZ/10; - queue_delayed_work(aio_wq, &ctx->wq, timeout); -} - -/* - * aio_run_all_iocbs: - * Process all pending retries queued on the ioctx - * run list, and keep running them until the list - * stays empty. - * Assumes it is operating within the aio issuer's mm context. - */ -static inline void aio_run_all_iocbs(struct kioctx *ctx) -{ - spin_lock_irq(&ctx->ctx_lock); - while (__aio_run_iocbs(ctx)) - ; - spin_unlock_irq(&ctx->ctx_lock); -} - -/* - * aio_kick_handler: - * Work queue handler triggered to process pending - * retries on an ioctx. Takes on the aio issuer's - * mm context before running the iocbs, so that - * copy_xxx_user operates on the issuer's address - * space. - * Run on aiod's context. 
- */ -static void aio_kick_handler(struct work_struct *work) -{ - struct kioctx *ctx = container_of(work, struct kioctx, wq.work); - mm_segment_t oldfs = get_fs(); - struct mm_struct *mm; - int requeue; - - set_fs(USER_DS); - use_mm(ctx->mm); - spin_lock_irq(&ctx->ctx_lock); - requeue =__aio_run_iocbs(ctx); - mm = ctx->mm; - spin_unlock_irq(&ctx->ctx_lock); - unuse_mm(mm); - set_fs(oldfs); - /* - * we're in a worker thread already; no point using non-zero delay - */ - if (requeue) - queue_delayed_work(aio_wq, &ctx->wq, 0); -} - - -/* - * Called by kick_iocb to queue the kiocb for retry - * and if required activate the aio work queue to process - * it - */ -static void try_queue_kicked_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - unsigned long flags; - int run = 0; - - spin_lock_irqsave(&ctx->ctx_lock, flags); - /* set this inside the lock so that we can't race with aio_run_iocb() - * testing it and putting the iocb on the run list under the lock */ - if (!kiocbTryKick(iocb)) - run = __queue_kicked_iocb(iocb); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - if (run) - aio_queue_work(ctx); -} - -/* - * kick_iocb: - * Called typically from a wait queue callback context - * to trigger a retry of the iocb. - * The retry is usually executed by aio workqueue - * threads (See aio_kick_handler). - */ -void kick_iocb(struct kiocb *iocb) -{ - /* sync iocbs are easy: they can only ever be executing from a - * single context. */ - if (is_sync_kiocb(iocb)) { - kiocbSetKicked(iocb); - wake_up_process(iocb->ki_obj.tsk); - return; - } - - try_queue_kicked_iocb(iocb); -} -EXPORT_SYMBOL(kick_iocb); - /* aio_complete * Called when the io request on the given iocb is complete. - * Returns true if this is the last user of the request. The - * only other user of the request can be the cancellation code. 
*/ -int aio_complete(struct kiocb *iocb, long res, long res2) +void aio_complete(struct kiocb *iocb, long res, long res2) { struct kioctx *ctx = iocb->ki_ctx; - struct aio_ring_info *info; struct aio_ring *ring; - struct io_event *event; + struct io_event *ev_page, *event; unsigned long flags; - unsigned long tail; - int ret; + unsigned tail, pos; /* * Special case handling for sync iocbs: @@ -909,61 +591,81 @@ int aio_complete(struct kiocb *iocb, long res, long res2) * - the sync task helpfully left a reference to itself in the iocb */ if (is_sync_kiocb(iocb)) { - BUG_ON(iocb->ki_users != 1); + BUG_ON(atomic_read(&iocb->ki_users) != 1); iocb->ki_user_data = res; - iocb->ki_users = 0; + atomic_set(&iocb->ki_users, 0); wake_up_process(iocb->ki_obj.tsk); - return 1; + return; } - info = &ctx->ring_info; - - /* add a completion event to the ring buffer. - * must be done holding ctx->ctx_lock to prevent - * other code from messing with the tail - * pointer since we might be called from irq - * context. + /* + * Take rcu_read_lock() in case the kioctx is being destroyed, as we + * need to issue a wakeup after decrementing reqs_active. */ - spin_lock_irqsave(&ctx->ctx_lock, flags); + rcu_read_lock(); - if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) - list_del_init(&iocb->ki_run_list); + if (iocb->ki_list.next) { + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_del(&iocb->ki_list); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + } /* * cancelled requests don't get events, userland was given one * when the event got cancelled. */ - if (kiocbIsCancelled(iocb)) + if (unlikely(xchg(&iocb->ki_cancel, + KIOCB_CANCELLED) == KIOCB_CANCELLED)) { + atomic_dec(&ctx->reqs_active); + /* Still need the wake_up in case free_ioctx is waiting */ goto put_rq; + } - ring = kmap_atomic(info->ring_pages[0]); + /* + * Add a completion event to the ring buffer. 
Must be done holding + * ctx->ctx_lock to prevent other code from messing with the tail + * pointer since we might be called from irq context. + */ + spin_lock_irqsave(&ctx->completion_lock, flags); + + tail = ctx->tail; + pos = tail + AIO_EVENTS_OFFSET; - tail = info->tail; - event = aio_ring_event(info, tail); - if (++tail >= info->nr) + if (++tail >= ctx->nr_events) tail = 0; + ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + event = ev_page + pos % AIO_EVENTS_PER_PAGE; + event->obj = (u64)(unsigned long)iocb->ki_obj.user; event->data = iocb->ki_user_data; event->res = res; event->res2 = res2; - dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, - res, res2); + kunmap_atomic(ev_page); + flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + + pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", + ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, + res, res2); /* after flagging the request as done, we * must never even look at it again */ smp_wmb(); /* make event visible before updating tail */ - info->tail = tail; - ring->tail = tail; + ctx->tail = tail; - put_aio_ring_event(event); + ring = kmap_atomic(ctx->ring_pages[0]); + ring->tail = tail; kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); - pr_debug("added to ring %p at [%lu]\n", iocb, tail); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + pr_debug("added to ring %p at [%u]\n", iocb, tail); /* * Check if the user asked us to deliver the result through an @@ -975,7 +677,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2) put_rq: /* everything turned out well, dispose of the aiocb. 
*/ - ret = __aio_put_req(ctx, iocb); + aio_put_req(iocb); /* * We have to order our ring_info tail store above and test @@ -988,233 +690,133 @@ put_rq: if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - return ret; + rcu_read_unlock(); } EXPORT_SYMBOL(aio_complete); -/* aio_read_evt - * Pull an event off of the ioctx's event ring. Returns the number of - * events fetched (0 or 1 ;-) - * FIXME: make this use cmpxchg. - * TODO: make the ringbuffer user mmap()able (requires FIXME). +/* aio_read_events + * Pull an event off of the ioctx's event ring. Returns the number of + * events fetched */ -static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +static long aio_read_events_ring(struct kioctx *ctx, + struct io_event __user *event, long nr) { - struct aio_ring_info *info = &ioctx->ring_info; struct aio_ring *ring; - unsigned long head; - int ret = 0; - - ring = kmap_atomic(info->ring_pages[0]); - dprintk("in aio_read_evt h%lu t%lu m%lu\n", - (unsigned long)ring->head, (unsigned long)ring->tail, - (unsigned long)ring->nr); - - if (ring->head == ring->tail) - goto out; + unsigned head, pos; + long ret = 0; + int copy_ret; - spin_lock(&info->ring_lock); - - head = ring->head % info->nr; - if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head); - *ent = *evp; - head = (head + 1) % info->nr; - smp_mb(); /* finish reading the event before updatng the head */ - ring->head = head; - ret = 1; - put_aio_ring_event(evp); - } - spin_unlock(&info->ring_lock); + mutex_lock(&ctx->ring_lock); -out: + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; kunmap_atomic(ring); - dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, - (unsigned long)ring->head, (unsigned long)ring->tail); - return ret; -} -struct aio_timeout { - struct timer_list timer; - int timed_out; - struct task_struct *p; -}; + pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); -static void 
timeout_func(unsigned long data) -{ - struct aio_timeout *to = (struct aio_timeout *)data; + if (head == ctx->tail) + goto out; - to->timed_out = 1; - wake_up_process(to->p); -} + while (ret < nr) { + long avail; + struct io_event *ev; + struct page *page; -static inline void init_timeout(struct aio_timeout *to) -{ - setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); - to->timed_out = 0; - to->p = current; -} + avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; + if (head == ctx->tail) + break; -static inline void set_timeout(long start_jiffies, struct aio_timeout *to, - const struct timespec *ts) -{ - to->timer.expires = start_jiffies + timespec_to_jiffies(ts); - if (time_after(to->timer.expires, jiffies)) - add_timer(&to->timer); - else - to->timed_out = 1; -} + avail = min(avail, nr - ret); + avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - + ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); -static inline void clear_timeout(struct aio_timeout *to) -{ - del_singleshot_timer_sync(&to->timer); -} + pos = head + AIO_EVENTS_OFFSET; + page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; + pos %= AIO_EVENTS_PER_PAGE; -static int read_events(struct kioctx *ctx, - long min_nr, long nr, - struct io_event __user *event, - struct timespec __user *timeout) -{ - long start_jiffies = jiffies; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - int ret; - int i = 0; - struct io_event ent; - struct aio_timeout to; - int retry = 0; - - /* needed to zero any padding within an entry (there shouldn't be - * any, but C is fun! - */ - memset(&ent, 0, sizeof(ent)); -retry: - ret = 0; - while (likely(i < nr)) { - ret = aio_read_evt(ctx, &ent); - if (unlikely(ret <= 0)) - break; + ev = kmap(page); + copy_ret = copy_to_user(event + ret, ev + pos, + sizeof(*ev) * avail); + kunmap(page); - dprintk("read event: %Lx %Lx %Lx %Lx\n", - ent.data, ent.obj, ent.res, ent.res2); - - /* Could we split the check in two? 
*/ - ret = -EFAULT; - if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { - dprintk("aio: lost an event due to EFAULT.\n"); - break; + if (unlikely(copy_ret)) { + ret = -EFAULT; + goto out; } - ret = 0; - /* Good, event copied to userland, update counts. */ - event ++; - i ++; + ret += avail; + head += avail; + head %= ctx->nr_events; } - if (min_nr <= i) - return i; - if (ret) - return ret; + ring = kmap_atomic(ctx->ring_pages[0]); + ring->head = head; + kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); - /* End fast path */ + pr_debug("%li h%u t%u\n", ret, head, ctx->tail); - /* racey check, but it gets redone */ - if (!retry && unlikely(!list_empty(&ctx->run_list))) { - retry = 1; - aio_run_all_iocbs(ctx); - goto retry; - } + atomic_sub(ret, &ctx->reqs_active); +out: + mutex_unlock(&ctx->ring_lock); - init_timeout(&to); - if (timeout) { - struct timespec ts; - ret = -EFAULT; - if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) - goto out; + return ret; +} - set_timeout(start_jiffies, &to, &ts); - } +static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, + struct io_event __user *event, long *i) +{ + long ret = aio_read_events_ring(ctx, event + *i, nr - *i); - while (likely(i < nr)) { - add_wait_queue_exclusive(&ctx->wait, &wait); - do { - set_task_state(tsk, TASK_INTERRUPTIBLE); - ret = aio_read_evt(ctx, &ent); - if (ret) - break; - if (min_nr <= i) - break; - if (unlikely(ctx->dead)) { - ret = -EINVAL; - break; - } - if (to.timed_out) /* Only check after read evt */ - break; - /* Try to only show up in io wait if there are ops - * in flight */ - if (ctx->reqs_active) - io_schedule(); - else - schedule(); - if (signal_pending(tsk)) { - ret = -EINTR; - break; - } - /*ret = aio_read_evt(ctx, &ent);*/ - } while (1) ; - - set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&ctx->wait, &wait); - - if (unlikely(ret <= 0)) - break; + if (ret > 0) + *i += ret; - ret = -EFAULT; - if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) 
{ - dprintk("aio: lost an event due to EFAULT.\n"); - break; - } + if (unlikely(atomic_read(&ctx->dead))) + ret = -EINVAL; - /* Good, event copied to userland, update counts. */ - event ++; - i ++; - } + if (!*i) + *i = ret; - if (timeout) - clear_timeout(&to); -out: - destroy_timer_on_stack(&to.timer); - return i ? i : ret; + return ret < 0 || *i >= min_nr; } -/* Take an ioctx and remove it from the list of ioctx's. Protects - * against races with itself via ->dead. - */ -static void io_destroy(struct kioctx *ioctx) +static long read_events(struct kioctx *ctx, long min_nr, long nr, + struct io_event __user *event, + struct timespec __user *timeout) { - struct mm_struct *mm = current->mm; - int was_dead; + ktime_t until = { .tv64 = KTIME_MAX }; + long ret = 0; - /* delete the entry from the list is someone else hasn't already */ - spin_lock(&mm->ioctx_lock); - was_dead = ioctx->dead; - ioctx->dead = 1; - hlist_del_rcu(&ioctx->list); - spin_unlock(&mm->ioctx_lock); + if (timeout) { + struct timespec ts; - dprintk("aio_release(%p)\n", ioctx); - if (likely(!was_dead)) - put_ioctx(ioctx); /* twice for the list */ + if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) + return -EFAULT; - kill_ctx(ioctx); + until = timespec_to_ktime(ts); + } /* - * Wake up any waiters. The setting of ctx->dead must be seen - * by other CPUs at this point. Right now, we rely on the - * locking done by the above calls to ensure this consistency. + * Note that aio_read_events() is being called as the conditional - i.e. + * we're calling it after prepare_to_wait() has set task state to + * TASK_INTERRUPTIBLE. + * + * But aio_read_events() can block, and if it blocks it's going to flip + * the task state back to TASK_RUNNING. + * + * This should be ok, provided it doesn't flip the state back to + * TASK_RUNNING and return 0 too much - that causes us to spin. That + * will only happen if the mutex_lock() call blocks, and we then find + * the ringbuffer empty. 
So in practice we should be ok, but it's + * something to be aware of when touching this code. */ - wake_up_all(&ioctx->wait); + wait_event_interruptible_hrtimeout(ctx->wait, + aio_read_events(ctx, min_nr, nr, event, &ret), until); + + if (!ret && signal_pending(current)) + ret = -EINTR; + + return ret; } /* sys_io_setup: @@ -1252,7 +854,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - io_destroy(ioctx); + kill_ioctx(ioctx); put_ioctx(ioctx); } @@ -1270,7 +872,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - io_destroy(ioctx); + kill_ioctx(ioctx); put_ioctx(ioctx); return 0; } @@ -1301,29 +903,22 @@ static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) BUG_ON(ret > 0 && iocb->ki_left == 0); } -static ssize_t aio_rw_vect_retry(struct kiocb *iocb) +typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + +static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - ssize_t (*rw_op)(struct kiocb *, const struct iovec *, - unsigned long, loff_t); ssize_t ret = 0; - unsigned short opcode; - - if ((iocb->ki_opcode == IOCB_CMD_PREADV) || - (iocb->ki_opcode == IOCB_CMD_PREAD)) { - rw_op = file->f_op->aio_read; - opcode = IOCB_CMD_PREADV; - } else { - rw_op = file->f_op->aio_write; - opcode = IOCB_CMD_PWRITEV; - } /* This matches the pread()/pwrite() logic */ if (iocb->ki_pos < 0) return -EINVAL; + if (rw == WRITE) + file_start_write(file); do { ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], iocb->ki_nr_segs - iocb->ki_cur_seg, @@ -1334,8 +929,10 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) /* retry all partial writes. retry partial reads as long as its a * regular file. 
*/ } while (ret > 0 && iocb->ki_left > 0 && - (opcode == IOCB_CMD_PWRITEV || + (rw == WRITE || (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); + if (rw == WRITE) + file_end_write(file); /* This means we must have transferred all that we could */ /* No need to retry anymore */ @@ -1344,81 +941,49 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) /* If we managed to write some out we return that, rather than * the eventual error. */ - if (opcode == IOCB_CMD_PWRITEV - && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY + if (rw == WRITE + && ret < 0 && ret != -EIOCBQUEUED && iocb->ki_nbytes - iocb->ki_left) ret = iocb->ki_nbytes - iocb->ki_left; return ret; } -static ssize_t aio_fdsync(struct kiocb *iocb) -{ - struct file *file = iocb->ki_filp; - ssize_t ret = -EINVAL; - - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(iocb, 1); - return ret; -} - -static ssize_t aio_fsync(struct kiocb *iocb) -{ - struct file *file = iocb->ki_filp; - ssize_t ret = -EINVAL; - - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(iocb, 0); - return ret; -} - -static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) +static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) { ssize_t ret; + kiocb->ki_nr_segs = kiocb->ki_nbytes; + #ifdef CONFIG_COMPAT if (compat) - ret = compat_rw_copy_check_uvector(type, + ret = compat_rw_copy_check_uvector(rw, (struct compat_iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, &kiocb->ki_iovec); else #endif - ret = rw_copy_check_uvector(type, + ret = rw_copy_check_uvector(rw, (struct iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, &kiocb->ki_iovec); if (ret < 0) - goto out; - - ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret); - if (ret < 0) - goto out; + return ret; - kiocb->ki_nr_segs = kiocb->ki_nbytes; - 
kiocb->ki_cur_seg = 0; - /* ki_nbytes/left now reflect bytes instead of segs */ + /* ki_nbytes now reflect bytes instead of segs */ kiocb->ki_nbytes = ret; - kiocb->ki_left = ret; - - ret = 0; -out: - return ret; + return 0; } -static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb) +static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) { - int bytes; - - bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left); - if (bytes < 0) - return bytes; + if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) + return -EFAULT; kiocb->ki_iovec = &kiocb->ki_inline_vec; kiocb->ki_iovec->iov_base = kiocb->ki_buf; - kiocb->ki_iovec->iov_len = bytes; + kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; kiocb->ki_nr_segs = 1; - kiocb->ki_cur_seg = 0; return 0; } @@ -1427,96 +992,95 @@ static ssize_t aio_setup_single_vector(int type, struct file * file, struct kioc * Performs the initial checks and aio retry method * setup for the kiocb at the time of io submission. 
*/ -static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) +static ssize_t aio_run_iocb(struct kiocb *req, bool compat) { - struct file *file = kiocb->ki_filp; - ssize_t ret = 0; + struct file *file = req->ki_filp; + ssize_t ret; + int rw; + fmode_t mode; + aio_rw_op *rw_op; - switch (kiocb->ki_opcode) { + switch (req->ki_opcode) { case IOCB_CMD_PREAD: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - break; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, - kiocb->ki_left))) - break; - ret = aio_setup_single_vector(READ, file, kiocb); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_read) - kiocb->ki_retry = aio_rw_vect_retry; - break; - case IOCB_CMD_PWRITE: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - break; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, - kiocb->ki_left))) - break; - ret = aio_setup_single_vector(WRITE, file, kiocb); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_write) - kiocb->ki_retry = aio_rw_vect_retry; - break; case IOCB_CMD_PREADV: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - break; - ret = aio_setup_vectored_rw(READ, kiocb, compat); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_read) - kiocb->ki_retry = aio_rw_vect_retry; - break; + mode = FMODE_READ; + rw = READ; + rw_op = file->f_op->aio_read; + goto rw_common; + + case IOCB_CMD_PWRITE: case IOCB_CMD_PWRITEV: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - break; - ret = aio_setup_vectored_rw(WRITE, kiocb, compat); + mode = FMODE_WRITE; + rw = WRITE; + rw_op = file->f_op->aio_write; + goto rw_common; +rw_common: + if (unlikely(!(file->f_mode & mode))) + return -EBADF; + + if (!rw_op) + return -EINVAL; + + ret = (req->ki_opcode == IOCB_CMD_PREADV || + req->ki_opcode == IOCB_CMD_PWRITEV) + ? 
aio_setup_vectored_rw(rw, req, compat) + : aio_setup_single_vector(rw, req); if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_write) - kiocb->ki_retry = aio_rw_vect_retry; + return ret; + + ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + if (ret < 0) + return ret; + + req->ki_nbytes = ret; + req->ki_left = ret; + + ret = aio_rw_vect_retry(req, rw, rw_op); break; + case IOCB_CMD_FDSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - kiocb->ki_retry = aio_fdsync; + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 1); break; + case IOCB_CMD_FSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - kiocb->ki_retry = aio_fsync; + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 0); break; + default: - dprintk("EINVAL: io_submit: no operation provided\n"); - ret = -EINVAL; + pr_debug("EINVAL: no operation provided\n"); + return -EINVAL; } - if (!kiocb->ki_retry) - return ret; + if (ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. 
+ */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || + ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; + aio_complete(req, ret, 0); + } return 0; } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, struct kiocb_batch *batch, - bool compat) + struct iocb *iocb, bool compat) { struct kiocb *req; - struct file *file; ssize_t ret; /* enforce forwards compatibility on users */ if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { - pr_debug("EINVAL: io_submit: reserve field set\n"); + pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } @@ -1530,16 +1094,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return -EINVAL; } - file = fget(iocb->aio_fildes); - if (unlikely(!file)) - return -EBADF; - - req = aio_get_req(ctx, batch); /* returns with 2 references to req */ - if (unlikely(!req)) { - fput(file); + req = aio_get_req(ctx); + if (unlikely(!req)) return -EAGAIN; + + req->ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->ki_filp)) { + ret = -EBADF; + goto out_put_req; } - req->ki_filp = file; + if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an @@ -1555,9 +1119,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, } } - ret = put_user(req->ki_key, &user_iocb->aio_key); + ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { - dprintk("EFAULT: aio_key\n"); + pr_debug("EFAULT: aio_key\n"); goto out_put_req; } @@ -1569,41 +1133,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_left = req->ki_nbytes = iocb->aio_nbytes; req->ki_opcode = iocb->aio_lio_opcode; - ret = aio_setup_iocb(req, compat); - + ret = aio_run_iocb(req, compat); if (ret) goto out_put_req; - spin_lock_irq(&ctx->ctx_lock); - /* - * We could have raced with io_destroy() and are currently holding a - * reference to ctx which should be 
destroyed. We cannot submit IO - * since ctx gets freed as soon as io_submit() puts its reference. The - * check here is reliable: io_destroy() sets ctx->dead before waiting - * for outstanding IO and the barrier between these two is realized by - * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we - * increment ctx->reqs_active before checking for ctx->dead and the - * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we - * don't see ctx->dead set here, io_destroy() waits for our IO to - * finish. - */ - if (ctx->dead) { - spin_unlock_irq(&ctx->ctx_lock); - ret = -EINVAL; - goto out_put_req; - } - aio_run_iocb(req); - if (!list_empty(&ctx->run_list)) { - /* drain the run list */ - while (__aio_run_iocbs(ctx)) - ; - } - spin_unlock_irq(&ctx->ctx_lock); - aio_put_req(req); /* drop extra ref to req */ return 0; - out_put_req: + atomic_dec(&ctx->reqs_active); aio_put_req(req); /* drop extra ref to req */ aio_put_req(req); /* drop i/o ref to req */ return ret; @@ -1616,7 +1153,6 @@ long do_io_submit(aio_context_t ctx_id, long nr, long ret = 0; int i = 0; struct blk_plug plug; - struct kiocb_batch batch; if (unlikely(nr < 0)) return -EINVAL; @@ -1629,12 +1165,10 @@ long do_io_submit(aio_context_t ctx_id, long nr, ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { - pr_debug("EINVAL: io_submit: invalid context id\n"); + pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } - kiocb_batch_init(&batch, nr); - blk_start_plug(&plug); /* @@ -1655,13 +1189,12 @@ long do_io_submit(aio_context_t ctx_id, long nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); + ret = io_submit_one(ctx, user_iocb, &tmp, compat); if (ret) break; } blk_finish_plug(&plug); - kiocb_batch_free(ctx, &batch); put_ioctx(ctx); return i ? 
i : ret; } @@ -1694,10 +1227,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, assert_spin_locked(&ctx->ctx_lock); + if (key != KIOCB_KEY) + return NULL; + /* TODO: use a hash or array, this sucks. */ list_for_each(pos, &ctx->active_reqs) { struct kiocb *kiocb = list_kiocb(pos); - if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key) + if (kiocb->ki_obj.user == iocb) return kiocb; } return NULL; @@ -1716,7 +1252,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { - int (*cancel)(struct kiocb *iocb, struct io_event *res); + struct io_event res; struct kioctx *ctx; struct kiocb *kiocb; u32 key; @@ -1731,32 +1267,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - ret = -EAGAIN; + kiocb = lookup_kiocb(ctx, iocb, key); - if (kiocb && kiocb->ki_cancel) { - cancel = kiocb->ki_cancel; - kiocb->ki_users ++; - kiocbSetCancelled(kiocb); - } else - cancel = NULL; + if (kiocb) + ret = kiocb_cancel(ctx, kiocb, &res); + else + ret = -EINVAL; + spin_unlock_irq(&ctx->ctx_lock); - if (NULL != cancel) { - struct io_event tmp; - pr_debug("calling cancel\n"); - memset(&tmp, 0, sizeof(tmp)); - tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; - tmp.data = kiocb->ki_user_data; - ret = cancel(kiocb, &tmp); - if (!ret) { - /* Cancellation succeeded -- copy the result - * into the user's buffer. - */ - if (copy_to_user(result, &tmp, sizeof(tmp))) - ret = -EFAULT; - } - } else - ret = -EINVAL; + if (!ret) { + /* Cancellation succeeded -- copy the result + * into the user's buffer. 
+ */ + if (copy_to_user(result, &res, sizeof(res))) + ret = -EFAULT; + } put_ioctx(ctx); @@ -1790,7 +1316,5 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, ret = read_events(ioctx, min_nr, nr, events, timeout); put_ioctx(ioctx); } - - asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout); return ret; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 01443ce43ee7..13ddec92341c 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -61,15 +61,6 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) /* This is an autofs submount, we can't expire it */ if (autofs_type_indirect(sbi->type)) goto done; - - /* - * Otherwise it's an offset mount and we need to check - * if we can umount its mount, if there is one. - */ - if (!d_mountpoint(path.dentry)) { - status = 0; - goto done; - } } /* Update the expiry counter if fs is busy */ diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index cddc74b9cdb2..b3db517e89ec 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -26,6 +26,7 @@ static struct file_system_type autofs_fs_type = { .mount = autofs_mount, .kill_sb = autofs4_kill_sb, }; +MODULE_ALIAS_FS("autofs"); static int __init init_autofs4_fs(void) { diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 9bd16255dd9c..085da86e07c2 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -408,7 +408,7 @@ done: return NULL; } -int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) +static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); diff --git a/fs/befs/btree.c b/fs/befs/btree.c index a66c9b1136e0..74e397db0b8b 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -436,8 +436,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, goto error; } - if ((this_node = (befs_btree_node *) - kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { + if ((this_node = 
kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { befs_error(sb, "befs_btree_read() failed to allocate %u " "bytes of memory", sizeof (befs_btree_node)); goto error; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index c8f4e25eb9e2..8615ee89ab55 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -951,6 +951,7 @@ static struct file_system_type befs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("befs"); static int __init init_befs_fs(void) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 737aaa3f7090..5e376bb93419 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -473,6 +473,7 @@ static struct file_system_type bfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("bfs"); static int __init init_bfs_fs(void) { diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index bbc8f8827eac..bce87694f7b0 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -62,7 +62,6 @@ static int aout_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); has_dumped = 1; - current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); dump.u_ar0 = offsetof(struct user, regs); dump.signal = cprm->siginfo->si_signo; @@ -287,15 +286,12 @@ static int load_aout_binary(struct linux_binprm * bprm) return error; } - error = bprm->file->f_op->read(bprm->file, - (char __user *)text_addr, - ex.a_text+ex.a_data, &pos); + error = read_code(bprm->file, text_addr, pos, + ex.a_text+ex.a_data); if ((signed long)error < 0) { send_sig(SIGKILL, current, 0); return error; } - - flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); } else { if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) @@ -311,14 +307,9 @@ static int load_aout_binary(struct linux_binprm * bprm) } if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { - loff_t pos = fd_offset; vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - 
bprm->file->f_op->read(bprm->file, - (char __user *)N_TXTADDR(ex), - ex.a_text+ex.a_data, &pos); - flush_icache_range((unsigned long) N_TXTADDR(ex), - (unsigned long) N_TXTADDR(ex) + - ex.a_text+ex.a_data); + read_code(bprm->file, N_TXTADDR(ex), fd_offset, + ex.a_text + ex.a_data); goto beyond_if; } @@ -397,8 +388,6 @@ static int load_aout_library(struct file *file) start_addr = ex.a_entry & 0xfffff000; if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { - loff_t pos = N_TXTOFF(ex); - if (printk_ratelimit()) { printk(KERN_WARNING @@ -407,11 +396,8 @@ static int load_aout_library(struct file *file) } vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - file->f_op->read(file, (char __user *)start_addr, - ex.a_text + ex.a_data, &pos); - flush_icache_range((unsigned long) start_addr, - (unsigned long) start_addr + ex.a_text + ex.a_data); - + read_code(file, start_addr, N_TXTOFF(ex), + ex.a_text + ex.a_data); retval = 0; goto out; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3939829f6c5c..f8a0b0efda44 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -240,6 +240,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid)); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); +#ifdef ELF_HWCAP2 + NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); +#endif NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { NEW_AUX_ENT(AT_PLATFORM, @@ -803,7 +806,8 @@ static int load_elf_binary(struct linux_binprm *bprm) * follow the loader, and is not movable. */ #ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE /* Memory randomization might have been switched off - * in runtime via sysctl. + * in runtime via sysctl or explicit setting of + * personality flags. * If that is the case, retain the original non-zero * load_bias value in order to establish proper * non-randomized mappings. 
@@ -1137,6 +1141,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, goto whole; if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) goto whole; + return 0; } /* Do not dump I/O mapped devices or special mappings */ @@ -2090,8 +2095,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto cleanup; has_dumped = 1; - current->flags |= PF_DUMPCORE; - + fs = get_fs(); set_fs(KERNEL_DS); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 9c13e023e2b7..c166f325a183 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -483,7 +483,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, size_t platform_len = 0, len; char *k_platform, *k_base_platform; char __user *u_platform, *u_base_platform, *p; - long hwcap; int loop; int nr; /* reset for each csp adjustment */ @@ -502,8 +501,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, return -EFAULT; #endif - hwcap = ELF_HWCAP; - /* * If this architecture has a platform capability string, copy it * to userspace. 
In some cases (Sparc), this info is impossible @@ -617,7 +614,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, nr = 0; csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); - NEW_AUX_ENT(AT_HWCAP, hwcap); + NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); +#ifdef ELF_HWCAP2 + NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); +#endif NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr); @@ -926,7 +926,6 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags; - loff_t fpos; int loop, ret; load_addr = params->load_addr; @@ -964,14 +963,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( if (params->phdrs[loop].p_type != PT_LOAD) continue; - fpos = phdr->p_offset; - seg->addr = maddr + (phdr->p_vaddr - base); seg->p_vaddr = phdr->p_vaddr; seg->p_memsz = phdr->p_memsz; - ret = file->f_op->read(file, (void *) seg->addr, - phdr->p_filesz, &fpos); + ret = read_code(file, seg->addr, phdr->p_offset, + phdr->p_filesz); if (ret < 0) return ret; @@ -1687,8 +1684,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) fill_elf_fdpic_header(elf, e_phnum); has_dumped = 1; - current->flags |= PF_DUMPCORE; - /* * Set up the notes in similar form to SVR4 core dumps made * with info from their /proc. diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 2036d21baaef..d50bbe59da1e 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -207,11 +207,12 @@ static int decompress_exec( /* Read in first chunk of data and parse gzip header. 
*/ fpos = offset; - ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); + ret = kernel_read(bprm->file, offset, buf, LBUFSIZE); strm.next_in = buf; strm.avail_in = ret; strm.total_in = 0; + fpos += ret; retval = -ENOEXEC; @@ -277,7 +278,7 @@ static int decompress_exec( } while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) { - ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); + ret = kernel_read(bprm->file, fpos, buf, LBUFSIZE); if (ret <= 0) break; len -= ret; @@ -285,6 +286,7 @@ static int decompress_exec( strm.next_in = buf; strm.avail_in = ret; strm.total_in = 0; + fpos += ret; } if (ret < 0) { @@ -428,6 +430,7 @@ static int load_flat_file(struct linux_binprm * bprm, unsigned long textpos = 0, datapos = 0, result; unsigned long realdatastart = 0; unsigned long text_len, data_len, bss_len, stack_len, flags; + unsigned long full_data; unsigned long len, memp = 0; unsigned long memp_size, extra, rlim; unsigned long *reloc = 0, *rp; @@ -451,6 +454,7 @@ static int load_flat_file(struct linux_binprm * bprm, relocs = ntohl(hdr->reloc_count); flags = ntohl(hdr->flags); rev = ntohl(hdr->rev); + full_data = data_len + relocs * sizeof(unsigned long); if (strncmp(hdr->magic, "bFLT", 4)) { /* @@ -577,12 +581,12 @@ static int load_flat_file(struct linux_binprm * bprm, #ifdef CONFIG_BINFMT_ZFLAT if (flags & FLAT_FLAG_GZDATA) { result = decompress_exec(bprm, fpos, (char *) datapos, - data_len + (relocs * sizeof(unsigned long)), 0); + full_data, 0); } else #endif { - result = bprm->file->f_op->read(bprm->file, (char *) datapos, - data_len + (relocs * sizeof(unsigned long)), &fpos); + result = read_code(bprm->file, datapos, fpos, + full_data); } if (IS_ERR_VALUE(result)) { printk("Unable to read data+bss, errno %d\n", (int)-result); @@ -627,30 +631,25 @@ static int load_flat_file(struct linux_binprm * bprm, if (flags & FLAT_FLAG_GZIP) { result = decompress_exec(bprm, sizeof (struct flat_hdr), (((char *) textpos) + sizeof (struct flat_hdr)), - 
(text_len + data_len + (relocs * sizeof(unsigned long)) + (text_len + full_data - sizeof (struct flat_hdr)), 0); memmove((void *) datapos, (void *) realdatastart, - data_len + (relocs * sizeof(unsigned long))); + full_data); } else if (flags & FLAT_FLAG_GZDATA) { - fpos = 0; - result = bprm->file->f_op->read(bprm->file, - (char *) textpos, text_len, &fpos); + result = read_code(bprm->file, textpos, 0, text_len); if (!IS_ERR_VALUE(result)) result = decompress_exec(bprm, text_len, (char *) datapos, - data_len + (relocs * sizeof(unsigned long)), 0); + full_data, 0); } else #endif { - fpos = 0; - result = bprm->file->f_op->read(bprm->file, - (char *) textpos, text_len, &fpos); - if (!IS_ERR_VALUE(result)) { - fpos = ntohl(hdr->data_start); - result = bprm->file->f_op->read(bprm->file, (char *) datapos, - data_len + (relocs * sizeof(unsigned long)), &fpos); - } + result = read_code(bprm->file, textpos, 0, text_len); + if (!IS_ERR_VALUE(result)) + result = read_code(bprm->file, datapos, + ntohl(hdr->data_start), + full_data); } if (IS_ERR_VALUE(result)) { printk("Unable to read code+data+bss, errno %d\n",(int)-result); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index fecbbf3f8ff2..1c740e152f38 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -23,6 +23,7 @@ #include <linux/binfmts.h> #include <linux/slab.h> #include <linux/ctype.h> +#include <linux/string_helpers.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/namei.h> @@ -234,24 +235,6 @@ static char *scanarg(char *s, char del) return s; } -static int unquote(char *from) -{ - char c = 0, *s = from, *p = from; - - while ((c = *s++) != '\0') { - if (c == '\\' && *s == 'x') { - s++; - c = toupper(*s++); - *p = (c - (isdigit(c) ? '0' : 'A' - 10)) << 4; - c = toupper(*s++); - *p++ |= c - (isdigit(c) ? 
'0' : 'A' - 10); - continue; - } - *p++ = c; - } - return p - from; -} - static char * check_special_flags (char * sfs, Node * e) { char * p = sfs; @@ -354,8 +337,9 @@ static Node *create_entry(const char __user *buffer, size_t count) p[-1] = '\0'; if (!e->mask[0]) e->mask = NULL; - e->size = unquote(e->magic); - if (e->mask && unquote(e->mask) != e->size) + e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX); + if (e->mask && + string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) goto Einval; if (e->size + e->offset > BINPRM_BUF_SIZE) goto Einval; @@ -720,6 +704,7 @@ static struct file_system_type bm_fs_type = { .mount = bm_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("binfmt_misc"); static int __init init_misc_binfmt(void) { diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index a3f28f331b2b..8fb42916d8a2 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -27,48 +27,11 @@ #include <linux/workqueue.h> #include <linux/slab.h> -struct integrity_slab { - struct kmem_cache *slab; - unsigned short nr_vecs; - char name[8]; -}; - -#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) } -struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = { - IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES), -}; -#undef IS +#define BIP_INLINE_VECS 4 +static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; -static inline unsigned int vecs_to_idx(unsigned int nr) -{ - switch (nr) { - case 1: - return 0; - case 2 ... 4: - return 1; - case 5 ... 16: - return 2; - case 17 ... 64: - return 3; - case 65 ... 128: - return 4; - case 129 ... 
BIO_MAX_PAGES: - return 5; - default: - BUG(); - } -} - -static inline int use_bip_pool(unsigned int idx) -{ - if (idx == BIOVEC_MAX_IDX) - return 1; - - return 0; -} - /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to @@ -84,37 +47,41 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, unsigned int nr_vecs) { struct bio_integrity_payload *bip; - unsigned int idx = vecs_to_idx(nr_vecs); struct bio_set *bs = bio->bi_pool; - - if (!bs) - bs = fs_bio_set; - - BUG_ON(bio == NULL); - bip = NULL; - - /* Lower order allocations come straight from slab */ - if (!use_bip_pool(idx)) - bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask); - - /* Use mempool if lower order alloc failed or max vecs were requested */ - if (bip == NULL) { - idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */ + unsigned long idx = BIO_POOL_NONE; + unsigned inline_vecs; + + if (!bs) { + bip = kmalloc(sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * nr_vecs, gfp_mask); + inline_vecs = nr_vecs; + } else { bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); - - if (unlikely(bip == NULL)) { - printk(KERN_ERR "%s: could not alloc bip\n", __func__); - return NULL; - } + inline_vecs = BIP_INLINE_VECS; } + if (unlikely(!bip)) + return NULL; + memset(bip, 0, sizeof(*bip)); + if (nr_vecs > inline_vecs) { + bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, + bs->bvec_integrity_pool); + if (!bip->bip_vec) + goto err; + } else { + bip->bip_vec = bip->bip_inline_vecs; + } + bip->bip_slab = idx; bip->bip_bio = bio; bio->bi_integrity = bip; return bip; +err: + mempool_free(bip, bs->bio_integrity_pool); + return NULL; } EXPORT_SYMBOL(bio_integrity_alloc); @@ -130,20 +97,18 @@ void bio_integrity_free(struct bio *bio) struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_set *bs = bio->bi_pool; - if (!bs) - bs = fs_bio_set; - - BUG_ON(bip == NULL); - - /* A cloned bio doesn't own the 
integrity metadata */ - if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) - && bip->bip_buf != NULL) + if (bip->bip_owns_buf) kfree(bip->bip_buf); - if (use_bip_pool(bip->bip_slab)) + if (bs) { + if (bip->bip_slab != BIO_POOL_NONE) + bvec_free(bs->bvec_integrity_pool, bip->bip_vec, + bip->bip_slab); + mempool_free(bip, bs->bio_integrity_pool); - else - kmem_cache_free(bip_slab[bip->bip_slab].slab, bip); + } else { + kfree(bip); + } bio->bi_integrity = NULL; } @@ -419,6 +384,7 @@ int bio_integrity_prep(struct bio *bio) return -EIO; } + bip->bip_owns_buf = 1; bip->bip_buf = buf; bip->bip_size = len; bip->bip_sector = bio->bi_sector; @@ -694,11 +660,11 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) bp->bio1.bi_integrity = &bp->bip1; bp->bio2.bi_integrity = &bp->bip2; - bp->iv1 = bip->bip_vec[0]; - bp->iv2 = bip->bip_vec[0]; + bp->iv1 = bip->bip_vec[bip->bip_idx]; + bp->iv2 = bip->bip_vec[bip->bip_idx]; - bp->bip1.bip_vec[0] = bp->iv1; - bp->bip2.bip_vec[0] = bp->iv2; + bp->bip1.bip_vec = &bp->iv1; + bp->bip2.bip_vec = &bp->iv2; bp->iv1.bv_len = sectors * bi->tuple_size; bp->iv2.bv_offset += sectors * bi->tuple_size; @@ -746,13 +712,14 @@ EXPORT_SYMBOL(bio_integrity_clone); int bioset_integrity_create(struct bio_set *bs, int pool_size) { - unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); - if (bs->bio_integrity_pool) return 0; - bs->bio_integrity_pool = - mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); + + bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); + if (!bs->bvec_integrity_pool) + return -1; if (!bs->bio_integrity_pool) return -1; @@ -765,13 +732,14 @@ void bioset_integrity_free(struct bio_set *bs) { if (bs->bio_integrity_pool) mempool_destroy(bs->bio_integrity_pool); + + if (bs->bvec_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); } EXPORT_SYMBOL(bioset_integrity_free); void __init 
bio_integrity_init(void) { - unsigned int i; - /* * kintegrityd won't block much but may burn a lot of CPU cycles. * Make it highpri CPU intensive wq with max concurrency of 1. @@ -781,14 +749,10 @@ void __init bio_integrity_init(void) if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); - for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) { - unsigned int size; - - size = sizeof(struct bio_integrity_payload) - + bip_slab[i].nr_vecs * sizeof(struct bio_vec); - - bip_slab[i].slab = - kmem_cache_create(bip_slab[i].name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - } + bip_slab = kmem_cache_create("bio_integrity_payload", + sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * BIP_INLINE_VECS, + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + if (!bip_slab) + panic("Failed to create slab\n"); } @@ -19,6 +19,7 @@ #include <linux/swap.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/uio.h> #include <linux/iocontext.h> #include <linux/slab.h> #include <linux/init.h> @@ -160,12 +161,12 @@ unsigned int bvec_nr_vecs(unsigned short idx) return bvec_slabs[idx].nr_vecs; } -void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) { BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); if (idx == BIOVEC_MAX_IDX) - mempool_free(bv, bs->bvec_pool); + mempool_free(bv, pool); else { struct biovec_slab *bvs = bvec_slabs + idx; @@ -173,8 +174,8 @@ void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) } } -struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, - struct bio_set *bs) +struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, + mempool_t *pool) { struct bio_vec *bvl; @@ -210,7 +211,7 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, */ if (*idx == BIOVEC_MAX_IDX) { fallback: - bvl = mempool_alloc(bs->bvec_pool, gfp_mask); + bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab 
*bvs = bvec_slabs + *idx; gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); @@ -252,8 +253,8 @@ static void bio_free(struct bio *bio) __bio_free(bio); if (bs) { - if (bio_has_allocated_vec(bio)) - bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); + if (bio_flagged(bio, BIO_OWNS_VEC)) + bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); /* * If we have front padding, adjust the bio pointer before freeing @@ -297,6 +298,54 @@ void bio_reset(struct bio *bio) } EXPORT_SYMBOL(bio_reset); +static void bio_alloc_rescue(struct work_struct *work) +{ + struct bio_set *bs = container_of(work, struct bio_set, rescue_work); + struct bio *bio; + + while (1) { + spin_lock(&bs->rescue_lock); + bio = bio_list_pop(&bs->rescue_list); + spin_unlock(&bs->rescue_lock); + + if (!bio) + break; + + generic_make_request(bio); + } +} + +static void punt_bios_to_rescuer(struct bio_set *bs) +{ + struct bio_list punt, nopunt; + struct bio *bio; + + /* + * In order to guarantee forward progress we must punt only bios that + * were allocated from this bio_set; otherwise, if there was a bio on + * there for a stacking driver higher up in the stack, processing it + * could require allocating bios from this bio_set, and doing that from + * our own rescuer would be bad. + * + * Since bio lists are singly linked, pop them all instead of trying to + * remove from the middle of the list: + */ + + bio_list_init(&punt); + bio_list_init(&nopunt); + + while ((bio = bio_list_pop(current->bio_list))) + bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + + *current->bio_list = nopunt; + + spin_lock(&bs->rescue_lock); + bio_list_merge(&bs->rescue_list, &punt); + spin_unlock(&bs->rescue_lock); + + queue_work(bs->rescue_workqueue, &bs->rescue_work); +} + /** * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator @@ -314,11 +363,27 @@ EXPORT_SYMBOL(bio_reset); * previously allocated bio for IO before attempting to allocate a new one. 
* Failure to do so can cause deadlocks under memory pressure. * + * Note that when running under generic_make_request() (i.e. any block + * driver), bios are not submitted until after you return - see the code in + * generic_make_request() that converts recursion into iteration, to prevent + * stack overflows. + * + * This would normally mean allocating multiple bios under + * generic_make_request() would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. + * + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * generic_make_request() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. + * * RETURNS: * Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + gfp_t saved_gfp = gfp_mask; unsigned front_pad; unsigned inline_vecs; unsigned long idx = BIO_POOL_NONE; @@ -336,7 +401,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) front_pad = 0; inline_vecs = nr_iovecs; } else { + /* + * generic_make_request() converts recursion to iteration; this + * means if we're running beneath it, any bios we allocate and + * submit will not be submitted (and thus freed) until after we + * return. + * + * This exposes us to a potential deadlock if we allocate + * multiple bios from the same bio_set() while running + * underneath generic_make_request(). If we were to allocate + * multiple bios (say a stacking block driver that was splitting + * bios), we would deadlock if we exhausted the mempool's + * reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. 
If we go to allocate and there are + * bios on current->bio_list, we first try the allocation + * without __GFP_WAIT; if that fails, we punt those bios we + * would be blocking to the rescuer workqueue before we retry + * with the original gfp_flags. + */ + + if (current->bio_list && !bio_list_empty(current->bio_list)) + gfp_mask &= ~__GFP_WAIT; + p = mempool_alloc(bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + p = mempool_alloc(bs->bio_pool, gfp_mask); + } + front_pad = bs->front_pad; inline_vecs = BIO_INLINE_VECS; } @@ -348,9 +443,17 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) bio_init(bio); if (nr_iovecs > inline_vecs) { - bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); + if (!bvl && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); + } + if (unlikely(!bvl)) goto err_free; + + bio->bi_flags |= 1 << BIO_OWNS_VEC; } else if (nr_iovecs) { bvl = bio->bi_inline_vecs; } @@ -652,6 +755,181 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, } EXPORT_SYMBOL(bio_add_page); +struct submit_bio_ret { + struct completion event; + int error; +}; + +static void submit_bio_wait_endio(struct bio *bio, int error) +{ + struct submit_bio_ret *ret = bio->bi_private; + + ret->error = error; + complete(&ret->event); +} + +/** + * submit_bio_wait - submit a bio, and wait until it completes + * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) + * @bio: The &struct bio which describes the I/O + * + * Simple wrapper around submit_bio(). Returns 0 on success, or the error from + * bio_endio() on failure. 
+ */ +int submit_bio_wait(int rw, struct bio *bio) +{ + struct submit_bio_ret ret; + + rw |= REQ_SYNC; + init_completion(&ret.event); + bio->bi_private = &ret; + bio->bi_end_io = submit_bio_wait_endio; + submit_bio(rw, bio); + wait_for_completion(&ret.event); + + return ret.error; +} +EXPORT_SYMBOL(submit_bio_wait); + +/** + * bio_advance - increment/complete a bio by some number of bytes + * @bio: bio to advance + * @bytes: number of bytes to complete + * + * This updates bi_sector, bi_size and bi_idx; if the number of bytes to + * complete doesn't align with a bvec boundary, then bv_len and bv_offset will + * be updated on the last bvec as well. + * + * @bio will then represent the remaining, uncompleted portion of the io. + */ +void bio_advance(struct bio *bio, unsigned bytes) +{ + if (bio_integrity(bio)) + bio_integrity_advance(bio, bytes); + + bio->bi_sector += bytes >> 9; + bio->bi_size -= bytes; + + if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) + return; + + while (bytes) { + if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { + WARN_ONCE(1, "bio idx %d >= vcnt %d\n", + bio->bi_idx, bio->bi_vcnt); + break; + } + + if (bytes >= bio_iovec(bio)->bv_len) { + bytes -= bio_iovec(bio)->bv_len; + bio->bi_idx++; + } else { + bio_iovec(bio)->bv_len -= bytes; + bio_iovec(bio)->bv_offset += bytes; + bytes = 0; + } + } +} +EXPORT_SYMBOL(bio_advance); + +/** + * bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. 
+ */ +int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, i) { + bv->bv_page = alloc_page(gfp_mask); + if (!bv->bv_page) { + while (--bv >= bio->bi_io_vec) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} +EXPORT_SYMBOL(bio_alloc_pages); + +/** + * bio_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). + */ +void bio_copy_data(struct bio *dst, struct bio *src) +{ + struct bio_vec *src_bv, *dst_bv; + unsigned src_offset, dst_offset, bytes; + void *src_p, *dst_p; + + src_bv = bio_iovec(src); + dst_bv = bio_iovec(dst); + + src_offset = src_bv->bv_offset; + dst_offset = dst_bv->bv_offset; + + while (1) { + if (src_offset == src_bv->bv_offset + src_bv->bv_len) { + src_bv++; + if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) { + src = src->bi_next; + if (!src) + break; + + src_bv = bio_iovec(src); + } + + src_offset = src_bv->bv_offset; + } + + if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) { + dst_bv++; + if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) { + dst = dst->bi_next; + if (!dst) + break; + + dst_bv = bio_iovec(dst); + } + + dst_offset = dst_bv->bv_offset; + } + + bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset, + src_bv->bv_offset + src_bv->bv_len - src_offset); + + src_p = kmap_atomic(src_bv->bv_page); + dst_p = kmap_atomic(dst_bv->bv_page); + + memcpy(dst_p + dst_bv->bv_offset, + src_p + src_bv->bv_offset, + bytes); + + kunmap_atomic(dst_p); + kunmap_atomic(src_p); + + src_offset += bytes; + dst_offset += bytes; + } +} +EXPORT_SYMBOL(bio_copy_data); + struct bio_map_data { struct 
bio_vec *iovecs; struct sg_iovec *sgvecs; @@ -714,7 +992,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, int iov_idx = 0; unsigned int iov_off = 0; - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { char *bv_addr = page_address(bvec->bv_page); unsigned int bv_len = iovecs[i].bv_len; @@ -896,7 +1174,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, return bio; cleanup: if (!map_data) - bio_for_each_segment(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); bio_put(bio); @@ -1110,7 +1388,7 @@ static void __bio_unmap_user(struct bio *bio) /* * make sure we dirty pages we wrote to */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { if (bio_data_dir(bio) == READ) set_page_dirty_lock(bvec->bv_page); @@ -1216,7 +1494,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err) int i; char *p = bmd->sgvecs[0].iov_base; - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); int len = bmd->iovecs[i].bv_len; @@ -1256,7 +1534,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, if (!reading) { void *p = data; - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); memcpy(addr, p, bvec->bv_len); @@ -1301,11 +1579,11 @@ EXPORT_SYMBOL(bio_copy_kern); */ void bio_set_pages_dirty(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (page && !PageCompound(page)) set_page_dirty_lock(page); @@ -1314,11 +1592,11 @@ void bio_set_pages_dirty(struct bio *bio) static void bio_release_pages(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int i; - 
for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (page) put_page(page); @@ -1367,16 +1645,16 @@ static void bio_dirty_fn(struct work_struct *work) void bio_check_pages_dirty(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int nr_clean_pages = 0; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (PageDirty(page) || PageCompound(page)) { page_cache_release(page); - bvec[i].bv_page = NULL; + bvec->bv_page = NULL; } else { nr_clean_pages++; } @@ -1428,8 +1706,6 @@ void bio_endio(struct bio *bio, int error) else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; - trace_block_bio_complete(bio, error); - if (bio->bi_end_io) bio->bi_end_io(bio, error); } @@ -1479,8 +1755,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) trace_block_split(bdev_get_queue(bi->bi_bdev), bi, bi->bi_sector + first_sectors); - BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0); - BUG_ON(bi->bi_idx != 0); + BUG_ON(bio_segments(bi) > 1); atomic_set(&bp->cnt, 3); bp->error = 0; bp->bio1 = *bi; @@ -1490,8 +1765,8 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) bp->bio1.bi_size = first_sectors << 9; if (bi->bi_vcnt != 0) { - bp->bv1 = bi->bi_io_vec[0]; - bp->bv2 = bi->bi_io_vec[0]; + bp->bv1 = *bio_iovec(bi); + bp->bv2 = *bio_iovec(bi); if (bio_is_rw(bi)) { bp->bv2.bv_offset += first_sectors << 9; @@ -1543,7 +1818,7 @@ sector_t bio_sector_offset(struct bio *bio, unsigned short index, if (index >= bio->bi_idx) index = bio->bi_vcnt - 1; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { if (i == index) { if (offset > bv->bv_offset) sectors += (offset - bv->bv_offset) / sector_sz; @@ -1561,29 +1836,25 @@ EXPORT_SYMBOL(bio_sector_offset); * create memory pools for biovec's in a 
bio_set. * use the global biovec slabs created for general use. */ -static int biovec_create_pools(struct bio_set *bs, int pool_entries) +mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) { struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; - bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab); - if (!bs->bvec_pool) - return -ENOMEM; - - return 0; -} - -static void biovec_free_pools(struct bio_set *bs) -{ - mempool_destroy(bs->bvec_pool); + return mempool_create_slab_pool(pool_entries, bp->slab); } void bioset_free(struct bio_set *bs) { + if (bs->rescue_workqueue) + destroy_workqueue(bs->rescue_workqueue); + if (bs->bio_pool) mempool_destroy(bs->bio_pool); + if (bs->bvec_pool) + mempool_destroy(bs->bvec_pool); + bioset_integrity_free(bs); - biovec_free_pools(bs); bio_put_slab(bs); kfree(bs); @@ -1614,6 +1885,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) bs->front_pad = front_pad; + spin_lock_init(&bs->rescue_lock); + bio_list_init(&bs->rescue_list); + INIT_WORK(&bs->rescue_work, bio_alloc_rescue); + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); if (!bs->bio_slab) { kfree(bs); @@ -1624,9 +1899,15 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - if (!biovec_create_pools(bs, pool_size)) - return bs; + bs->bvec_pool = biovec_create_pool(bs, pool_size); + if (!bs->bvec_pool) + goto bad; + + bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); + if (!bs->rescue_workqueue) + goto bad; + return bs; bad: bioset_free(bs); return NULL; diff --git a/fs/block_dev.c b/fs/block_dev.c index aea605c98ba6..2091db8cdd78 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -27,6 +27,7 @@ #include <linux/namei.h> #include <linux/log2.h> #include <linux/cleancache.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include "internal.h" @@ -551,6 +552,7 @@ struct block_device *bdgrab(struct block_device *bdev) 
ihold(bdev->bd_inode); return bdev; } +EXPORT_SYMBOL(bdgrab); long nr_blockdev_pages(void) { @@ -616,11 +618,9 @@ void bd_forget(struct inode *inode) struct block_device *bdev = NULL; spin_lock(&bdev_lock); - if (inode->i_bdev) { - if (!sb_is_blkdev_sb(inode->i_sb)) - bdev = inode->i_bdev; - __bd_forget(inode); - } + if (!sb_is_blkdev_sb(inode->i_sb)) + bdev = inode->i_bdev; + __bd_forget(inode); spin_unlock(&bdev_lock); if (bdev) @@ -1046,7 +1046,7 @@ void bd_set_size(struct block_device *bdev, loff_t size) } EXPORT_SYMBOL(bd_set_size); -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); /* * bd_mutex locking: @@ -1401,9 +1401,8 @@ static int blkdev_open(struct inode * inode, struct file * filp) return blkdev_get(bdev, filp->f_mode, filp); } -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) { - int ret = 0; struct gendisk *disk = bdev->bd_disk; struct block_device *victim = NULL; @@ -1423,7 +1422,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) } if (bdev->bd_contains == bdev) { if (disk->fops->release) - ret = disk->fops->release(disk, mode); + disk->fops->release(disk, mode); } if (!bdev->bd_openers) { struct module *owner = disk->fops->owner; @@ -1442,10 +1441,9 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) bdput(bdev); if (victim) __blkdev_put(victim, mode, 1); - return ret; } -int blkdev_put(struct block_device *bdev, fmode_t mode) +void blkdev_put(struct block_device *bdev, fmode_t mode) { mutex_lock(&bdev->bd_mutex); @@ -1489,15 +1487,15 @@ int blkdev_put(struct block_device *bdev, fmode_t mode) mutex_unlock(&bdev->bd_mutex); - return __blkdev_put(bdev, mode, 0); + __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, 
struct file * filp) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); - - return blkdev_put(bdev, filp->f_mode); + blkdev_put(bdev, filp->f_mode); + return 0; } static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) @@ -1558,7 +1556,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, return 0; size -= pos; - if (size < INT_MAX) + if (size < iocb->ki_left) nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); return generic_file_aio_read(iocb, iov, nr_segs, pos); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ecd25a1b4e51..ca9d8f1a3bb6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -651,6 +651,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, NULL)) return 0; + __tree_mod_log_free_eb(fs_info, old_root); + ret = tree_mod_alloc(fs_info, flags, &tm); if (ret < 0) goto out; @@ -736,7 +738,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) static noinline void tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, - unsigned long src_offset, int nr_items) + unsigned long src_offset, int nr_items, int log_removal) { int ret; int i; @@ -750,10 +752,12 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, } for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); + if (log_removal) { + ret = tree_mod_log_insert_key_locked(fs_info, src, + i + src_offset, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + } ret = tree_mod_log_insert_key_locked(fs_info, dst, i + dst_offset, MOD_LOG_KEY_ADD); @@ -927,7 +931,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); /* -ENOMEM */ } - tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); 
*last_ref = 1; } @@ -1046,6 +1049,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); + tree_mod_log_free_eb(root->fs_info, buf); btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -1750,7 +1754,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - tree_mod_log_free_eb(root->fs_info, root->node); tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); @@ -2995,7 +2998,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items = min(src_nritems - 8, push_items); tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3066,7 +3069,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, sizeof(struct btrfs_key_ptr)); tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + src_nritems - push_items, push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3218,12 +3221,18 @@ static noinline int split_node(struct btrfs_trans_handle *trans, int mid; int ret; u32 c_nritems; + int tree_mod_log_removal = 1; c = path->nodes[level]; WARN_ON(btrfs_header_generation(c) != trans->transid); if (c == root->node) { /* trying to split the root, lets make a new one */ ret = insert_new_root(trans, root, path, level + 1); + /* + * removal of root nodes has been logged by + * tree_mod_log_set_root_pointer due to locking + */ + tree_mod_log_removal = 0; if (ret) return ret; } else { @@ -3261,7 +3270,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - 
mid); + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid, + tree_mod_log_removal); copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0b278b117cbe..14fce27b4780 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -22,8 +22,9 @@ #include "disk-io.h" #include "transaction.h" -#define BTRFS_DELAYED_WRITEBACK 400 -#define BTRFS_DELAYED_BACKGROUND 100 +#define BTRFS_DELAYED_WRITEBACK 512 +#define BTRFS_DELAYED_BACKGROUND 128 +#define BTRFS_DELAYED_BATCH 16 static struct kmem_cache *delayed_node_cache; @@ -494,6 +495,15 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, BTRFS_DELAYED_DELETION_ITEM); } +static void finish_one_item(struct btrfs_delayed_root *delayed_root) +{ + int seq = atomic_inc_return(&delayed_root->items_seq); + if ((atomic_dec_return(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) && + waitqueue_active(&delayed_root->wait)) + wake_up(&delayed_root->wait); +} + static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { struct rb_root *root; @@ -512,10 +522,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) rb_erase(&delayed_item->rb_node, root); delayed_item->delayed_node->count--; - if (atomic_dec_return(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND && - waitqueue_active(&delayed_root->wait)) - wake_up(&delayed_root->wait); + + finish_one_item(delayed_root); } static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) @@ -1056,10 +1064,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) delayed_node->count--; delayed_root = delayed_node->root->fs_info->delayed_root; - if (atomic_dec_return(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND && - waitqueue_active(&delayed_root->wait)) - wake_up(&delayed_root->wait); + 
finish_one_item(delayed_root); } } @@ -1304,35 +1309,44 @@ void btrfs_remove_delayed_node(struct inode *inode) btrfs_release_delayed_node(delayed_node); } -struct btrfs_async_delayed_node { - struct btrfs_root *root; - struct btrfs_delayed_node *delayed_node; +struct btrfs_async_delayed_work { + struct btrfs_delayed_root *delayed_root; + int nr; struct btrfs_work work; }; -static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) +static void btrfs_async_run_delayed_root(struct btrfs_work *work) { - struct btrfs_async_delayed_node *async_node; + struct btrfs_async_delayed_work *async_work; + struct btrfs_delayed_root *delayed_root; struct btrfs_trans_handle *trans; struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; - int need_requeue = 0; + int total_done = 0; - async_node = container_of(work, struct btrfs_async_delayed_node, work); + async_work = container_of(work, struct btrfs_async_delayed_work, work); + delayed_root = async_work->delayed_root; path = btrfs_alloc_path(); if (!path) goto out; - path->leave_spinning = 1; - delayed_node = async_node->delayed_node; +again: + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2) + goto free_path; + + delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + if (!delayed_node) + goto free_path; + + path->leave_spinning = 1; root = delayed_node->root; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) - goto free_path; + goto release_path; block_rsv = trans->block_rsv; trans->block_rsv = &root->fs_info->delayed_block_rsv; @@ -1363,57 +1377,47 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) * Task1 will sleep until the transaction is commited. 
*/ mutex_lock(&delayed_node->mutex); - if (delayed_node->count) - need_requeue = 1; - else - btrfs_dequeue_delayed_node(root->fs_info->delayed_root, - delayed_node); + btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node); mutex_unlock(&delayed_node->mutex); trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); btrfs_btree_balance_dirty_nodelay(root); + +release_path: + btrfs_release_path(path); + total_done++; + + btrfs_release_prepared_delayed_node(delayed_node); + if (async_work->nr == 0 || total_done < async_work->nr) + goto again; + free_path: btrfs_free_path(path); out: - if (need_requeue) - btrfs_requeue_work(&async_node->work); - else { - btrfs_release_prepared_delayed_node(delayed_node); - kfree(async_node); - } + wake_up(&delayed_root->wait); + kfree(async_work); } + static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, - struct btrfs_root *root, int all) + struct btrfs_root *root, int nr) { - struct btrfs_async_delayed_node *async_node; - struct btrfs_delayed_node *curr; - int count = 0; + struct btrfs_async_delayed_work *async_work; -again: - curr = btrfs_first_prepared_delayed_node(delayed_root); - if (!curr) + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return 0; - async_node = kmalloc(sizeof(*async_node), GFP_NOFS); - if (!async_node) { - btrfs_release_prepared_delayed_node(curr); + async_work = kmalloc(sizeof(*async_work), GFP_NOFS); + if (!async_work) return -ENOMEM; - } - - async_node->root = root; - async_node->delayed_node = curr; - - async_node->work.func = btrfs_async_run_delayed_node_done; - async_node->work.flags = 0; - btrfs_queue_worker(&root->fs_info->delayed_workers, &async_node->work); - count++; - - if (all || count < 4) - goto again; + async_work->delayed_root = delayed_root; + async_work->work.func = btrfs_async_run_delayed_root; + async_work->work.flags = 0; + async_work->nr = nr; + btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work); 
return 0; } @@ -1424,30 +1428,55 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root) WARN_ON(btrfs_first_delayed_node(delayed_root)); } +static int refs_newer(struct btrfs_delayed_root *delayed_root, + int seq, int count) +{ + int val = atomic_read(&delayed_root->items_seq); + + if (val < seq || val >= seq + count) + return 1; + return 0; +} + void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; + int seq; delayed_root = btrfs_get_delayed_root(root); if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return; + seq = atomic_read(&delayed_root->items_seq); + if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { int ret; - ret = btrfs_wq_run_delayed_node(delayed_root, root, 1); + DEFINE_WAIT(__wait); + + ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); if (ret) return; - wait_event_interruptible_timeout( - delayed_root->wait, - (atomic_read(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND), - HZ); - return; + while (1) { + prepare_to_wait(&delayed_root->wait, &__wait, + TASK_INTERRUPTIBLE); + + if (refs_newer(delayed_root, seq, + BTRFS_DELAYED_BATCH) || + atomic_read(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND) { + break; + } + if (!signal_pending(current)) + schedule(); + else + break; + } + finish_wait(&delayed_root->wait, &__wait); } - btrfs_wq_run_delayed_node(delayed_root, root, 0); + btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); } /* Will return 0 or -ENOMEM */ diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 78b6ad0fc669..1d5c5f7abe3e 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -43,6 +43,7 @@ struct btrfs_delayed_root { */ struct list_head prepare_list; atomic_t items; /* for delayed items */ + atomic_t items_seq; /* for delayed items */ int nodes; /* for delayed nodes */ wait_queue_head_t wait; }; @@ -86,6 +87,7 @@ static inline void btrfs_init_delayed_root( struct btrfs_delayed_root 
*delayed_root) { atomic_set(&delayed_root->items, 0); + atomic_set(&delayed_root->items_seq, 0); delayed_root->nodes = 0; spin_lock_init(&delayed_root->lock); init_waitqueue_head(&delayed_root->wait); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 02369a3c162e..6d19a0a554aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -62,7 +62,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t); static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, @@ -1291,6 +1291,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); + leaf = NULL; goto fail; } @@ -1334,11 +1335,16 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(leaf); + return root; + fail: - if (ret) - return ERR_PTR(ret); + if (leaf) { + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + } + kfree(root); - return root; + return ERR_PTR(ret); } static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, @@ -3253,7 +3259,7 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); btrfs_free_log_root_tree(NULL, fs_info); } @@ -3687,7 +3693,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, return ret; } -static void 
btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t) { struct btrfs_pending_snapshot *snapshot; struct list_head splice; @@ -3700,10 +3706,8 @@ static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) snapshot = list_entry(splice.next, struct btrfs_pending_snapshot, list); - + snapshot->error = -ECANCELED; list_del_init(&snapshot->list); - - kfree(snapshot); } } @@ -3840,6 +3844,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->blocked = 1; wake_up(&root->fs_info->transaction_blocked_wait); + btrfs_evict_pending_snapshots(cur_trans); + cur_trans->blocked = 0; wake_up(&root->fs_info->transaction_wait); @@ -3849,8 +3855,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - btrfs_destroy_pending_snapshots(cur_trans); - btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(root, @@ -3894,6 +3898,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) wake_up(&root->fs_info->transaction_blocked_wait); + btrfs_evict_pending_snapshots(t); + t->blocked = 0; smp_mb(); if (waitqueue_active(&root->fs_info->transaction_wait)) @@ -3907,8 +3913,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - btrfs_destroy_pending_snapshots(t); - btrfs_destroy_delalloc_inodes(root); spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3e074dab2d57..3d551231caba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -257,7 +257,8 @@ static int exclude_super_stripes(struct btrfs_root *root, cache->bytes_super += stripe_len; ret = add_excluded_extent(root, cache->key.objectid, stripe_len); - 
BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -265,13 +266,17 @@ static int exclude_super_stripes(struct btrfs_root *root, ret = btrfs_rmap_block(&root->fs_info->mapping_tree, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; while (nr--) { cache->bytes_super += stripe_len; ret = add_excluded_extent(root, logical[nr], stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + kfree(logical); + return ret; + } } kfree(logical); @@ -1467,8 +1472,11 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, if (ret && !insert) { err = -ENOENT; goto out; + } else if (ret) { + err = -EIO; + WARN_ON(1); + goto out; } - BUG_ON(ret); /* Corruption */ leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -4435,7 +4443,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); - block_rsv->size = num_bytes; + block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + sinfo->bytes_reserved + sinfo->bytes_readonly + @@ -4790,14 +4798,49 @@ out_fail: * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers * so we can just reduce our inodes csum bytes and carry on. - * Otherwise we have to do the normal free thing to account for - * the case that the free side didn't free up its reserve - * because of this outstanding reservation. 
*/ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) + if (BTRFS_I(inode)->csum_bytes == csum_bytes) { calc_csum_metadata_size(inode, num_bytes, 0); - else - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + } else { + u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; + u64 bytes; + + /* + * This is tricky, but first we need to figure out how much we + * free'd from any free-ers that occured during this + * reservation, so we reset ->csum_bytes to the csum_bytes + * before we dropped our lock, and then call the free for the + * number of bytes that were freed while we were trying our + * reservation. + */ + bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; + BTRFS_I(inode)->csum_bytes = csum_bytes; + to_free = calc_csum_metadata_size(inode, bytes, 0); + + + /* + * Now we need to see how much we would have freed had we not + * been making this reservation and our ->csum_bytes were not + * artificially inflated. + */ + BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; + bytes = csum_bytes - orig_csum_bytes; + bytes = calc_csum_metadata_size(inode, bytes, 0); + + /* + * Now reset ->csum_bytes to what it should be. If bytes is + * more than to_free then we would have free'd more space had we + * not had an artificially high ->csum_bytes, so we need to free + * the remainder. If bytes is the same or less then we don't + * need to do anything, the other free-ers did the correct + * thing. + */ + BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; + if (bytes > to_free) + to_free = bytes - to_free; + else + to_free = 0; + } spin_unlock(&BTRFS_I(inode)->lock); if (dropped) to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -7944,7 +7987,17 @@ int btrfs_read_block_groups(struct btrfs_root *root) * info has super bytes accounted for, otherwise we'll think * we have more space than we actually do. 
*/ - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + goto error; + } /* * check for two cases, either we are full, and therefore @@ -8086,7 +8139,17 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + return ret; + } add_new_free_space(cache, root->fs_info, chunk_offset, chunk_offset + size); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f173c5af6461..73f2bfe3ac93 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1257,6 +1257,39 @@ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) GFP_NOFS); } +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + clear_page_dirty_for_io(page); + page_cache_release(page); + index++; + } + return 0; +} + +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + account_page_redirty(page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + 
} + return 0; +} + /* * helper function to set both pages and extents in the tree writeback */ @@ -2527,8 +2560,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (old_compressed) contig = bio->bi_sector == sector; else - contig = bio->bi_sector + (bio->bi_size >> 9) == - sector; + contig = bio_end_sector(bio) == sector; if (prev_bio_flags != bio_flags || !contig || merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6068a1985560..258c92156857 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -325,6 +325,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long *map_len); int extent_range_uptodate(struct extent_io_tree *tree, u64 start, u64 end); +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ec160202be3e..c4628a201cb3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -118,9 +118,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= csum_size; - if (csum_offset >= csums_in_item) { + if (csum_offset == csums_in_item) { ret = -EFBIG; goto fail; + } else if (csum_offset > csums_in_item) { + goto fail; } } item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); @@ -728,7 +730,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, return -ENOMEM; sector_sum = sums->sums; - trans->adding_csums = 1; again: next_offset = (u64)-1; found_next = 0; @@ -899,7 +900,6 @@ next_sector: goto again; } out: - trans->adding_csums = 0; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/file.c 
b/fs/btrfs/file.c index af1d0605a5c1..bc4d54c465a0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> +#include <linux/aio.h> #include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> @@ -591,6 +592,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); clear_bit(EXTENT_FLAG_PINNED, &em->flags); + clear_bit(EXTENT_FLAG_LOGGING, &flags); remove_extent_mapping(em_tree, em); if (no_splits) goto next; @@ -1513,8 +1515,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, size_t count, ocount; bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); - sb_start_write(inode->i_sb); - mutex_lock(&inode->i_mutex); err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); @@ -1616,7 +1616,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); out: - sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; } @@ -2141,6 +2140,7 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -2168,6 +2168,11 @@ static long btrfs_fallocate(struct file *file, int mode, ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) return ret; + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); + if (ret) + goto out_reserve_fail; + } /* * wait for ordered IO before we have any locks. 
We'll loop again @@ -2271,6 +2276,9 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); + if (root->fs_info->quota_enabled) + btrfs_qgroup_free(root, alloc_end - alloc_start); +out_reserve_fail: /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c226daefd65d..898da0a01e04 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,6 +32,7 @@ #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> +#include <linux/aio.h> #include <linux/bit_spinlock.h> #include <linux/xattr.h> #include <linux/posix_acl.h> @@ -353,6 +354,7 @@ static noinline int compress_file_range(struct inode *inode, int i; int will_compress; int compress_type = root->fs_info->compress_type; + int redirty = 0; /* if this is a small write inside eof, kick off a defrag */ if ((end - start + 1) < 16 * 1024 && @@ -415,6 +417,17 @@ again: if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; + /* + * we need to call clear_page_dirty_for_io on each + * page in the range. Otherwise applications with the file + * mmap'd can wander in and change the page contents while + * we are compressing them. + * + * If the compression fails for any reason, we set the pages + * dirty again later on. 
+ */ + extent_range_clear_dirty_for_io(inode, start, end); + redirty = 1; ret = btrfs_compress_pages(compress_type, inode->i_mapping, start, total_compressed, pages, @@ -554,6 +567,8 @@ cleanup_and_bail_uncompressed: __set_page_dirty_nobuffers(locked_page); /* unlocked later on in the async handlers */ } + if (redirty) + extent_range_redirty_for_io(inode, start, end); add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; @@ -1743,8 +1758,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sum; list_for_each_entry(sum, list, list) { + trans->adding_csums = 1; btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); + trans->adding_csums = 0; } return 0; } @@ -2312,6 +2329,7 @@ again: key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; + path->leave_spinning = 1; if (merge) { struct btrfs_file_extent_item *fi; u64 extent_len; @@ -2368,6 +2386,7 @@ again: btrfs_mark_buffer_dirty(leaf); inode_add_bytes(inode, len); + btrfs_release_path(path); ret = btrfs_inc_extent_ref(trans, root, new->bytenr, new->disk_len, 0, @@ -2381,6 +2400,7 @@ again: ret = 1; out_free_path: btrfs_release_path(path); + path->leave_spinning = 0; btrfs_end_transaction(trans, root); out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, @@ -3676,11 +3696,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, * 1 for the dir item * 1 for the dir index * 1 for the inode ref - * 1 for the inode ref in the tree log - * 2 for the dir entries in the log * 1 for the inode */ - trans = btrfs_start_transaction(root, 8); + trans = btrfs_start_transaction(root, 5); if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; @@ -8124,7 +8142,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * inodes. 
So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. */ - trans = btrfs_start_transaction(root, 20); + trans = btrfs_start_transaction(root, 11); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; @@ -8502,6 +8520,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, struct btrfs_key ins; u64 cur_offset = start; u64 i_size; + u64 cur_bytes; int ret = 0; bool own_trans = true; @@ -8516,8 +8535,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } } - ret = btrfs_reserve_extent(trans, root, - min(num_bytes, 256ULL * 1024 * 1024), + cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); + cur_bytes = max(cur_bytes, min_size); + ret = btrfs_reserve_extent(trans, root, cur_bytes, min_size, 0, *alloc_hint, &ins, 1); if (ret) { if (own_trans) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c83086fdda05..2c02310ff2d9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -527,6 +527,8 @@ fail: if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); + if (err) + err = btrfs_commit_transaction(trans, root); } else { err = btrfs_commit_transaction(trans, root); } @@ -592,16 +594,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, *async_transid = trans->transid; ret = btrfs_commit_transaction_async(trans, root->fs_info->extent_root, 1); + if (ret) + ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); } - if (ret) { - /* cleanup_transaction has freed this for us */ - if (trans->aborted) - pending_snapshot = NULL; + if (ret) goto fail; - } ret = pending_snapshot->error; if (ret) @@ -2245,13 +2245,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) if (ret) return ret; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev 
add/delete/balance/replace/resize operation in progress\n"); - mnt_drop_write_file(file); - return -EINVAL; - } - if (btrfs_root_readonly(root)) { ret = -EROFS; goto out; @@ -2306,7 +2299,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EINVAL; } out: - atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); mnt_drop_write_file(file); return ret; } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index ca52681e5f40..b81e0e9a4894 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -26,7 +26,6 @@ void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); -int btrfs_try_spin_lock(struct extent_buffer *eb); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index dc08d77b717e..005c45db699e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -557,6 +557,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); + mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); list_splice_init(&root->fs_info->ordered_extents, &splice); while (!list_empty(&splice)) { @@ -600,6 +601,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) cond_resched(); } + mutex_unlock(&root->fs_info->ordered_operations_mutex); } /* diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index aee4b1cc3d98..b44124dd2370 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1153,7 +1153,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, sgn > 0 ? 
node->seq - 1 : node->seq, &roots); if (ret < 0) - goto out; + return ret; spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -1275,7 +1275,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = 0; unlock: spin_unlock(&fs_info->qgroup_lock); -out: ulist_free(roots); ulist_free(tmp); @@ -1525,21 +1524,23 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qg->reserved + qg->rfer + num_bytes > - qg->max_rfer) + qg->max_rfer) { ret = -EDQUOT; + goto out; + } if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && qg->reserved + qg->excl + num_bytes > - qg->max_excl) + qg->max_excl) { ret = -EDQUOT; + goto out; + } list_for_each_entry(glist, &qg->groups, next_group) { ulist_add(ulist, glist->group->qgroupid, (uintptr_t)glist->group, GFP_ATOMIC); } } - if (ret) - goto out; /* * no limits exceeded, now record the reservation into all qgroups diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 50695dc5e2ab..b67171e6d688 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1269,6 +1269,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del) } spin_unlock(&rc->reloc_root_tree.lock); + if (!node) + return 0; BUG_ON((struct btrfs_root *)node->data != root); if (!del) { @@ -2238,13 +2240,28 @@ again: } static noinline_for_stack +void free_reloc_roots(struct list_head *list) +{ + struct btrfs_root *reloc_root; + + while (!list_empty(list)) { + reloc_root = list_entry(list->next, struct btrfs_root, + root_list); + __update_reloc_root(reloc_root, 1); + free_extent_buffer(reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + kfree(reloc_root); + } +} + +static noinline_for_stack int merge_reloc_roots(struct reloc_control *rc) { struct btrfs_root *root; struct btrfs_root *reloc_root; LIST_HEAD(reloc_roots); int found = 0; - int ret; + int ret = 0; again: root = rc->extent_root; @@ -2270,20 +2287,33 @@ again: 
BUG_ON(root->reloc_root != reloc_root); ret = merge_reloc_root(rc, root); - BUG_ON(ret); + if (ret) + goto out; } else { list_del_init(&reloc_root->root_list); } ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); - BUG_ON(ret < 0); + if (ret < 0) { + if (list_empty(&reloc_root->root_list)) + list_add_tail(&reloc_root->root_list, + &reloc_roots); + goto out; + } } if (found) { found = 0; goto again; } +out: + if (ret) { + btrfs_std_error(root->fs_info, ret); + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); + } + BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); - return 0; + return ret; } static void free_block_list(struct rb_root *blocks) @@ -2818,8 +2848,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, int err = 0; path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + err = -ENOMEM; + goto out_path; + } rb_node = rb_first(blocks); while (rb_node) { @@ -2858,10 +2890,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, rb_node = rb_next(rb_node); } out: - free_block_list(blocks); err = finish_pending_nodes(trans, rc, path, err); btrfs_free_path(path); +out_path: + free_block_list(blocks); return err; } @@ -3698,7 +3731,15 @@ int prepare_to_relocate(struct reloc_control *rc) set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + /* + * extent tree is not a ref_cow tree and has no reloc_root to + * cleanup. And callers are responsible to free the above + * block rsv. 
+ */ + return PTR_ERR(trans); + } btrfs_commit_transaction(trans, rc->extent_root); return 0; } @@ -3730,7 +3771,11 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) while (1) { progress++; trans = btrfs_start_transaction(rc->extent_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + trans = NULL; + break; + } restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans, rc->extent_root); @@ -4264,14 +4309,9 @@ int btrfs_recover_relocation(struct btrfs_root *root) out_free: kfree(rc); out: - while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); - list_del(&reloc_root->root_list); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); - } + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); + btrfs_free_path(path); if (err == 0) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 53c3501fa4ca..85e072b956d5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -542,7 +542,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); - btrfs_release_path(path); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -558,7 +557,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret < 0 ? -1 : ref_level, ret < 0 ? 
-1 : ref_root); } while (ret != 1); + btrfs_release_path(path); } else { + btrfs_release_path(path); swarn.path = path; swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f7a8b861058b..c85e7c6b4598 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3945,12 +3945,10 @@ static int is_extent_unchanged(struct send_ctx *sctx, found_key.type != key.type) { key.offset += right_len; break; - } else { - if (found_key.offset != key.offset + right_len) { - /* Should really not happen */ - ret = -EIO; - goto out; - } + } + if (found_key.offset != key.offset + right_len) { + ret = 0; + goto out; } key = found_key; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 68a29a1ea068..f6b88595f858 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1558,6 +1558,7 @@ static struct file_system_type btrfs_fs_type = { .kill_sb = btrfs_kill_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("btrfs"); /* * used by btrfsctl to scan devices when no FS is mounted diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e52da6fb1165..50767bbaad6c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -625,14 +625,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - /* - * the same root has to be passed to start_transaction and - * end_transaction. Subvolume quota depends on this. - */ - WARN_ON(trans->root != root); if (trans->qgroup_reserved) { - btrfs_qgroup_free(root, trans->qgroup_reserved); + /* + * the same root has to be passed here between start_transaction + * and end_transaction. Subvolume quota depends on this. + */ + btrfs_qgroup_free(trans->root, trans->qgroup_reserved); trans->qgroup_reserved = 0; } @@ -1052,7 +1051,12 @@ int btrfs_defrag_root(struct btrfs_root *root) /* * new snapshots need to be created at a very specific time in the - * transaction commit. 
This does the actual creation + * transaction commit. This does the actual creation. + * + * Note: + * If the error which may affect the commitment of the current transaction + * happens, we should return the error number. If the error which just affect + * the creation of the pending snapshots, just return 0. */ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -1071,7 +1075,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct extent_buffer *tmp; struct extent_buffer *old; struct timespec cur_time = CURRENT_TIME; - int ret; + int ret = 0; u64 to_reserve = 0; u64 index = 0; u64 objectid; @@ -1080,40 +1084,36 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { - ret = pending->error = -ENOMEM; - return ret; + pending->error = -ENOMEM; + return 0; } new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { - ret = pending->error = -ENOMEM; + pending->error = -ENOMEM; goto root_item_alloc_fail; } - ret = btrfs_find_free_objectid(tree_root, &objectid); - if (ret) { - pending->error = ret; + pending->error = btrfs_find_free_objectid(tree_root, &objectid); + if (pending->error) goto no_free_objectid; - } btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add(root, &pending->block_rsv, - to_reserve, - BTRFS_RESERVE_NO_FLUSH); - if (ret) { - pending->error = ret; + pending->error = btrfs_block_rsv_add(root, + &pending->block_rsv, + to_reserve, + BTRFS_RESERVE_NO_FLUSH); + if (pending->error) goto no_free_objectid; - } } - ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - pending->error = ret; + pending->error = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (pending->error) goto no_free_objectid; - } key.objectid = 
objectid; key.offset = (u64)-1; @@ -1141,7 +1141,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry->d_name.len, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; - goto fail; + goto dir_item_existed; } else if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); btrfs_abort_transaction(trans, root, ret); @@ -1272,6 +1272,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, root, ret); fail: + pending->error = ret; +dir_item_existed: trans->block_rsv = rsv; trans->bytes_reserved = 0; no_free_objectid: @@ -1287,12 +1289,17 @@ root_item_alloc_fail: static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - struct btrfs_pending_snapshot *pending; + struct btrfs_pending_snapshot *pending, *next; struct list_head *head = &trans->transaction->pending_snapshots; + int ret = 0; - list_for_each_entry(pending, head, list) - create_pending_snapshot(trans, fs_info, pending); - return 0; + list_for_each_entry_safe(pending, next, head, list) { + list_del(&pending->list); + ret = create_pending_snapshot(trans, fs_info, pending); + if (ret) + break; + } + return ret; } static void update_super_roots(struct btrfs_root *root) @@ -1448,6 +1455,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, err); spin_lock(&root->fs_info->trans_lock); + + if (list_empty(&cur_trans->list)) { + spin_unlock(&root->fs_info->trans_lock); + btrfs_end_transaction(trans, root); + return; + } + list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { root->fs_info->trans_no_join = 1; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c7ef569eb22a..ef96381569a4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -317,6 +317,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, unsigned long src_ptr; 
unsigned long dst_ptr; int overwrite_root = 0; + bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) overwrite_root = 1; @@ -326,6 +327,9 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, /* look for the key in the destination tree */ ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; + if (ret == 0) { char *src_copy; char *dst_copy; @@ -367,6 +371,30 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, return 0; } + /* + * We need to load the old nbytes into the inode so when we + * replay the extents we've logged we get the right nbytes. + */ + if (inode_item) { + struct btrfs_inode_item *item; + u64 nbytes; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + nbytes = btrfs_inode_nbytes(path->nodes[0], item); + item = btrfs_item_ptr(eb, slot, + struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, nbytes); + } + } else if (inode_item) { + struct btrfs_inode_item *item; + + /* + * New inode, set nbytes to 0 so that the nbytes comes out + * properly when we replay the extents. 
+ */ + item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, 0); } insert: btrfs_release_path(path); @@ -486,7 +514,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, int found_type; u64 extent_end; u64 start = key->offset; - u64 saved_nbytes; + u64 nbytes = 0; struct btrfs_file_extent_item *item; struct inode *inode = NULL; unsigned long size; @@ -496,10 +524,19 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(eb, item); if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) - extent_end = start + btrfs_file_extent_num_bytes(eb, item); - else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + nbytes = btrfs_file_extent_num_bytes(eb, item); + extent_end = start + nbytes; + + /* + * We don't add to the inodes nbytes if we are prealloc or a + * hole. + */ + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + nbytes = 0; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size = btrfs_file_extent_inline_len(eb, item); + nbytes = btrfs_file_extent_ram_bytes(eb, item); extent_end = ALIGN(start + size, root->sectorsize); } else { ret = 0; @@ -548,7 +585,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); BUG_ON(ret); @@ -635,7 +671,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); } - inode_set_bytes(inode, saved_nbytes); + inode_add_bytes(inode, nbytes); ret = btrfs_update_inode(trans, root, inode); out: if (inode) @@ -1382,7 +1418,10 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (ret == 0) { - btrfs_inc_nlink(inode); + if (!inode->i_nlink) + set_nlink(inode, 1); + else + 
btrfs_inc_nlink(inode); ret = btrfs_update_inode(trans, root, inode); } else if (ret == -EEXIST) { ret = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 35bb2d4ed29f..678977226570 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -684,6 +684,12 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) __btrfs_close_devices(fs_devices); free_fs_devices(fs_devices); } + /* + * Wait for rcu kworkers under __btrfs_close_devices + * to finish all blkdev_puts so device is really + * free when umount is done. + */ + rcu_barrier(); return ret; } @@ -2379,7 +2385,11 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, return ret; trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_std_error(root->fs_info, ret); + return ret; + } lock_chunks(root); @@ -3050,7 +3060,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); - BUG_ON(ret); + if (ret) + btrfs_std_error(fs_info, ret); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } @@ -3230,6 +3241,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl, update_ioctl_balance_args(fs_info, 0, bargs); } + if ((ret && ret != -ECANCELED && ret != -ENOSPC) || + balance_need_close(fs_info)) { + __cancel_balance(fs_info); + } + wake_up(&fs_info->balance_wait_q); return ret; @@ -4919,7 +4935,18 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, em = lookup_extent_mapping(em_tree, chunk_start, 1); read_unlock(&em_tree->lock); - BUG_ON(!em || em->start != chunk_start); + if (!em) { + printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", + chunk_start); + return -EIO; + } + + if (em->start != chunk_start) { + printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", + em->start, chunk_start); + free_extent_map(em); + return -EIO; + } map = (struct map_lookup *)em->bdev; length = em->len; @@ -5150,7 +5177,7 @@ 
static int bio_size_ok(struct block_device *bdev, struct bio *bio, } prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if ((bio->bi_size >> 9) > max_sectors) + if (bio_sectors(bio) > max_sectors) return 0; if (!q->merge_bvec_fn) diff --git a/fs/buffer.c b/fs/buffer.c index b4dcb34c9635..d2a4d1bb2d57 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -865,8 +865,6 @@ try_again: /* Link the buffer to its page */ set_bh_page(bh, page, offset); - - init_buffer(bh, NULL, NULL); } return head; /* @@ -2949,7 +2947,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) } } -int submit_bh(int rw, struct buffer_head * bh) +int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) { struct bio *bio; int ret = 0; @@ -2979,15 +2977,20 @@ int submit_bh(int rw, struct buffer_head * bh) bio->bi_io_vec[0].bv_offset = bh_offset(bh); bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = bh->b_size; bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; + bio->bi_flags |= bio_flags; /* Take care of bh's that straddle the end of the device */ guard_bh_eod(rw, bio, bh); + if (buffer_meta(bh)) + rw |= REQ_META; + if (buffer_prio(bh)) + rw |= REQ_PRIO; + bio_get(bio); submit_bio(rw, bio); @@ -2997,6 +3000,12 @@ int submit_bh(int rw, struct buffer_head * bh) bio_put(bio); return ret; } +EXPORT_SYMBOL_GPL(_submit_bh); + +int submit_bh(int rw, struct buffer_head *bh) +{ + return _submit_bh(rw, bh, 0); +} EXPORT_SYMBOL(submit_bh); /** diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 480992259707..317f9ee9c991 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -962,12 +962,14 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) } data = kmap(page); + file_start_write(file); old_fs = get_fs(); set_fs(KERNEL_DS); ret = file->f_op->write( file, (const void __user *) data, len, &pos); set_fs(old_fs); kunmap(page); + file_end_write(file); if (ret != len) ret = -EIO; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 
a60ea977af6f..3e68ac101040 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -236,15 +236,21 @@ static int ceph_readpage(struct file *filp, struct page *page) static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { struct inode *inode = req->r_inode; + struct ceph_osd_data *osd_data; int rc = req->r_result; int bytes = le32_to_cpu(msg->hdr.data_len); + int num_pages; int i; dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ - for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { - struct page *page = req->r_pages[i]; + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + for (i = 0; i < num_pages; i++) { + struct page *page = osd_data->pages[i]; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ @@ -257,8 +263,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) SetPageUptodate(page); unlock_page(page); page_cache_release(page); + bytes -= PAGE_CACHE_SIZE; } - kfree(req->r_pages); + kfree(osd_data->pages); } static void ceph_unlock_page_vector(struct page **pages, int num_pages) @@ -279,6 +286,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) &ceph_inode_to_client(inode)->client->osdc; struct ceph_inode_info *ci = ceph_inode(inode); struct page *page = list_entry(page_list->prev, struct page, lru); + struct ceph_vino vino; struct ceph_osd_request *req; u64 off; u64 len; @@ -303,18 +311,17 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) len = nr_pages << PAGE_CACHE_SHIFT; dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, off, len); - - req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), - off, &len, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, 0, + vino = ceph_vino(inode); + req 
= ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, + 1, CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, - NULL, false, 0); + false); if (IS_ERR(req)) return PTR_ERR(req); /* build page vector */ - nr_pages = len >> PAGE_CACHE_SHIFT; + nr_pages = calc_pages_for(0, len); pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); ret = -ENOMEM; if (!pages) @@ -336,11 +343,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - req->r_pages = pages; - req->r_num_pages = nr_pages; + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); req->r_callback = finish_read; req->r_inode = inode; + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) @@ -373,7 +381,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, + dout("readpages %p file %p nr_pages %d max %d\n", inode, + file, nr_pages, max); while (!list_empty(page_list)) { rc = start_read(inode, page_list, max); @@ -548,17 +557,23 @@ static void writepages_finish(struct ceph_osd_request *req, { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_data *osd_data; unsigned wrote; struct page *page; + int num_pages; int i; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; int rc = req->r_result; - u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length); + u64 bytes = req->r_ops[0].extent.length; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != 
CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); if (rc >= 0) { /* * Assume we wrote the pages we originally sent. The @@ -566,7 +581,7 @@ static void writepages_finish(struct ceph_osd_request *req, * raced with a truncation and was adjusted at the osd, * so don't believe the reply. */ - wrote = req->r_num_pages; + wrote = num_pages; } else { wrote = 0; mapping_set_error(mapping, rc); @@ -575,8 +590,8 @@ static void writepages_finish(struct ceph_osd_request *req, inode, rc, bytes, wrote); /* clean all pages */ - for (i = 0; i < req->r_num_pages; i++) { - page = req->r_pages[i]; + for (i = 0; i < num_pages; i++) { + page = osd_data->pages[i]; BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -605,32 +620,34 @@ static void writepages_finish(struct ceph_osd_request *req, unlock_page(page); } dout("%p wrote+cleaned %d pages\n", inode, wrote); - ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); + ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); - ceph_release_pages(req->r_pages, req->r_num_pages); - if (req->r_pages_from_pool) - mempool_free(req->r_pages, + ceph_release_pages(osd_data->pages, num_pages); + if (osd_data->pages_from_pool) + mempool_free(osd_data->pages, ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); else - kfree(req->r_pages); + kfree(osd_data->pages); ceph_osdc_put_request(req); } -/* - * allocate a page vec, either directly, or if necessary, via a the - * mempool. we avoid the mempool if we can because req->r_num_pages - * may be less than the maximum write size. 
- */ -static void alloc_page_vec(struct ceph_fs_client *fsc, - struct ceph_osd_request *req) +static struct ceph_osd_request * +ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len, + struct ceph_snap_context *snapc, int num_ops) { - req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, - GFP_NOFS); - if (!req->r_pages) { - req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); - req->r_pages_from_pool = 1; - WARN_ON(!req->r_pages); - } + struct ceph_fs_client *fsc; + struct ceph_inode_info *ci; + struct ceph_vino vino; + + fsc = ceph_inode_to_client(inode); + ci = ceph_inode(inode); + vino = ceph_vino(inode); + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + + return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, offset, len, num_ops, CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK, + snapc, ci->i_truncate_seq, ci->i_truncate_size, true); } /* @@ -653,7 +670,7 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; int do_sync; - u64 snap_size = 0; + u64 snap_size; /* * Include a 'sync' in the OSD request if this is a data @@ -699,6 +716,7 @@ static int ceph_writepages_start(struct address_space *mapping, retry: /* find oldest snap context with dirty data */ ceph_put_snap_context(snapc); + snap_size = 0; snapc = get_oldest_context(inode, &snap_size); if (!snapc) { /* hmm, why does writepages get called when there @@ -706,6 +724,8 @@ retry: dout(" no snap context with dirty data?\n"); goto out; } + if (snap_size == 0) + snap_size = i_size_read(inode); dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); if (last_snapc && snapc != last_snapc) { @@ -718,10 +738,14 @@ retry: last_snapc = snapc; while (!done && index <= end) { + int num_ops = do_sync ? 
2 : 1; + struct ceph_vino vino; unsigned i; int first; pgoff_t next; int pvec_pages, locked_pages; + struct page **pages = NULL; + mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; int want; u64 offset, len; @@ -773,11 +797,8 @@ get_more_pages: dout("waiting on writeback %p\n", page); wait_on_page_writeback(page); } - if ((snap_size && page_offset(page) > snap_size) || - (!snap_size && - page_offset(page) > i_size_read(inode))) { - dout("%p page eof %llu\n", page, snap_size ? - snap_size : i_size_read(inode)); + if (page_offset(page) >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); done = 1; unlock_page(page); break; @@ -805,22 +826,23 @@ get_more_pages: break; } - /* ok */ + /* + * We have something to write. If this is + * the first locked page this time through, + * allocate an osd request and a page array + * that it will use. + */ if (locked_pages == 0) { + size_t size; + + BUG_ON(pages); + /* prepare async write request */ - offset = (u64) page_offset(page); + offset = (u64)page_offset(page); len = wsize; - req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, - ceph_vino(inode), - offset, &len, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, - snapc, do_sync, - ci->i_truncate_seq, - ci->i_truncate_size, - &inode->i_mtime, true, 0); + req = ceph_writepages_osd_request(inode, + offset, &len, snapc, + num_ops); if (IS_ERR(req)) { rc = PTR_ERR(req); @@ -828,11 +850,17 @@ get_more_pages: break; } - max_pages = req->r_num_pages; - - alloc_page_vec(fsc, req); req->r_callback = writepages_finish; req->r_inode = inode; + + max_pages = calc_pages_for(0, (u64)len); + size = max_pages * sizeof (*pages); + pages = kmalloc(size, GFP_NOFS); + if (!pages) { + pool = fsc->wb_pagevec_pool; + pages = mempool_alloc(pool, GFP_NOFS); + BUG_ON(!pages); + } } /* note position of first page in pvec */ @@ -850,7 +878,7 @@ get_more_pages: } set_page_writeback(page); - req->r_pages[locked_pages] = page; + 
pages[locked_pages] = page; locked_pages++; next = page->index + 1; } @@ -879,18 +907,27 @@ get_more_pages: pvec.nr -= i-first; } - /* submit the write */ - offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; - len = min((snap_size ? snap_size : i_size_read(inode)) - offset, + /* Format the osd request message and submit the write */ + + offset = page_offset(pages[0]); + len = min(snap_size - offset, (u64)locked_pages << PAGE_CACHE_SHIFT); dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - /* revise final length, page count */ - req->r_num_pages = locked_pages; - req->r_request_ops[0].extent.length = cpu_to_le64(len); - req->r_request_ops[0].payload_len = cpu_to_le32(len); - req->r_request->hdr.data_len = cpu_to_le32(len); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + !!pool, false); + + pages = NULL; /* request message now owns the pages array */ + pool = NULL; + + /* Update the write op length in case we changed it */ + + osd_req_op_extent_update(req, 0, len); + + vino = ceph_vino(inode); + ceph_osdc_build_request(req, offset, snapc, vino.snap, + &inode->i_mtime); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); @@ -1067,51 +1104,23 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_file_info *fi = file->private_data; struct page *page; pgoff_t index = pos >> PAGE_CACHE_SHIFT; - int r, want, got = 0; - - if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; - - dout("write_begin %p %llx.%llx %llu~%u getting caps. 
i_size %llu\n", - inode, ceph_vinop(inode), pos, len, inode->i_size); - r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len); - if (r < 0) - return r; - dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); - if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) { - ceph_put_cap_refs(ci, got); - return -EAGAIN; - } + int r; do { /* get a page */ page = grab_cache_page_write_begin(mapping, index, 0); - if (!page) { - r = -ENOMEM; - break; - } + if (!page) + return -ENOMEM; + *pagep = page; dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); r = ceph_update_writeable_page(file, pos, len, page); - if (r) - page_cache_release(page); } while (r == -EAGAIN); - if (r) { - ceph_put_cap_refs(ci, got); - } else { - *pagep = page; - *(int *)fsdata = got; - } return r; } @@ -1125,12 +1134,10 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; - int got = (unsigned long)fsdata; dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, inode, page, (int)pos, (int)copied, (int)len); @@ -1153,19 +1160,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, up_read(&mdsc->snap_rwsem); page_cache_release(page); - if (copied > 0) { - int dirty; - spin_lock(&ci->i_ceph_lock); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&ci->i_ceph_lock); - if (dirty) - __mark_inode_dirty(inode, dirty); - } - - dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); - ceph_put_cap_refs(ci, got); - if (check_cap) ceph_check_caps(ceph_inode(inode), 
CHECK_CAPS_AUTHONLY, NULL); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 78e2f575247d..da0f9b8a3bcb 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -490,15 +490,17 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, ci->i_rdcache_gen++; /* - * if we are newly issued FILE_SHARED, clear D_COMPLETE; we + * if we are newly issued FILE_SHARED, mark dir not complete; we * don't know what happened to this directory while we didn't * have the cap. */ if ((issued & CEPH_CAP_FILE_SHARED) && (had & CEPH_CAP_FILE_SHARED) == 0) { ci->i_shared_gen++; - if (S_ISDIR(ci->vfs_inode.i_mode)) - ceph_dir_clear_complete(&ci->vfs_inode); + if (S_ISDIR(ci->vfs_inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->vfs_inode); + __ceph_dir_clear_complete(ci); + } } } @@ -553,6 +555,7 @@ retry: cap->implemented = 0; cap->mds = mds; cap->mds_wanted = 0; + cap->mseq = 0; cap->ci = ci; __insert_cap_node(ci, cap); @@ -628,7 +631,10 @@ retry: cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; - cap->mds_wanted |= wanted; + if (mseq > cap->mseq) + cap->mds_wanted = wanted; + else + cap->mds_wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; @@ -997,9 +1003,9 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } -static void __queue_cap_release(struct ceph_mds_session *session, - u64 ino, u64 cap_id, u32 migrate_seq, - u32 issue_seq) +void __queue_cap_release(struct ceph_mds_session *session, + u64 ino, u64 cap_id, u32 migrate_seq, + u32 issue_seq) { struct ceph_msg *msg; struct ceph_mds_cap_release *head; @@ -2046,6 +2052,13 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out; } + /* finish pending truncate */ + while (ci->i_truncate_pending) { + spin_unlock(&ci->i_ceph_lock); + __ceph_do_pending_vmtruncate(inode, !(need & CEPH_CAP_FILE_WR)); + spin_lock(&ci->i_ceph_lock); + } + if (need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > 
(loff_t)ci->i_max_size) { dout("get_cap_refs %p endoff %llu > maxsize %llu\n", @@ -2067,12 +2080,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, } have = __ceph_caps_issued(ci, &implemented); - /* - * disallow writes while a truncate is pending - */ - if (ci->i_truncate_pending) - have &= ~CEPH_CAP_FILE_WR; - if ((have & need) == need) { /* * Look at (implemented & ~have & not) so that we keep waiting diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 6d797f46d772..f02d82b7933e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -107,7 +107,7 @@ static unsigned fpos_off(loff_t p) * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * - * D_COMPLETE tells indicates we have all dentries in the dir. It is + * Complete dir indicates that we have all dentries in the dir. It is * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). */ @@ -198,8 +198,8 @@ more: filp->f_pos++; /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_test_complete(dir)) { - dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); + if (!ceph_dir_is_complete(dir)) { + dout(" lost dir complete on %p; falling back to mds\n", dir); err = -EAGAIN; goto out; } @@ -258,7 +258,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) if (filp->f_pos == 0) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ - fi->dir_release_count = ci->i_release_count; + fi->dir_release_count = atomic_read(&ci->i_release_count); dout("readdir off 0 -> '.'\n"); if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), @@ -284,7 +284,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) if ((filp->f_pos == 2 || fi->dentry) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && - ceph_dir_test_complete(inode) && + __ceph_dir_is_complete(ci) && 
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { spin_unlock(&ci->i_ceph_lock); err = __dcache_readdir(filp, dirent, filldir); @@ -350,7 +350,8 @@ more: if (!req->r_did_prepopulate) { dout("readdir !did_prepopulate"); - fi->dir_release_count--; /* preclude D_COMPLETE */ + /* preclude from marking dir complete */ + fi->dir_release_count--; } /* note next offset and last dentry name */ @@ -428,8 +429,9 @@ more: * the complete dir contents in our cache. */ spin_lock(&ci->i_ceph_lock); - if (ci->i_release_count == fi->dir_release_count) { - ceph_dir_set_complete(inode); + if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { + dout(" marking %p complete\n", inode); + __ceph_dir_set_complete(ci, fi->dir_release_count); ci->i_max_offset = filp->f_pos; } spin_unlock(&ci->i_ceph_lock); @@ -604,7 +606,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && - ceph_dir_test_complete(dir) && + __ceph_dir_is_complete(ci) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { spin_unlock(&ci->i_ceph_lock); dout(" dir %p complete, -ENOENT\n", dir); @@ -1065,44 +1067,6 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, } /* - * Set/clear/test dir complete flag on the dir's dentry. 
- */ -void ceph_dir_set_complete(struct inode *inode) -{ - struct dentry *dentry = d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry) && - ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { - dout(" marking %p (%p) complete\n", inode, dentry); - set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } - dput(dentry); -} - -void ceph_dir_clear_complete(struct inode *inode) -{ - struct dentry *dentry = d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry)) { - dout(" marking %p (%p) complete\n", inode, dentry); - set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } - dput(dentry); -} - -bool ceph_dir_test_complete(struct inode *inode) -{ - struct dentry *dentry = d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry)) { - dout(" marking %p (%p) NOT complete\n", inode, dentry); - clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } - dput(dentry); - return false; -} - -/* * When the VFS prunes a dentry from the cache, we need to clear the * complete flag on the parent directory. * @@ -1110,15 +1074,13 @@ bool ceph_dir_test_complete(struct inode *inode) */ static void ceph_d_prune(struct dentry *dentry) { - struct ceph_dentry_info *di; - dout("ceph_d_prune %p\n", dentry); /* do we have a valid parent? 
*/ if (IS_ROOT(dentry)) return; - /* if we are not hashed, we don't affect D_COMPLETE */ + /* if we are not hashed, we don't affect dir's completeness */ if (d_unhashed(dentry)) return; @@ -1126,8 +1088,7 @@ static void ceph_d_prune(struct dentry *dentry) * we hold d_lock, so d_parent is stable, and d_fsdata is never * cleared until d_release */ - di = ceph_dentry(dentry->d_parent); - clear_bit(CEPH_D_COMPLETE, &di->flags); + ceph_dir_clear_complete(dentry->d_parent->d_inode); } /* diff --git a/fs/ceph/file.c b/fs/ceph/file.c index bf338d9b67e3..656e16907430 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -7,6 +7,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/writeback.h> +#include <linux/aio.h> #include "super.h" #include "mds_client.h" @@ -446,19 +447,35 @@ done: } /* - * Write commit callback, called if we requested both an ACK and - * ONDISK commit reply from the OSD. + * Write commit request unsafe callback, called to tell us when a + * request is unsafe (that is, in flight--has been handed to the + * messenger to send to its target osd). It is called again when + * we've received a response message indicating the request is + * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request + * is completed early (and unsuccessfully) due to a timeout or + * interrupt. + * + * This is used if we requested both an ACK and ONDISK commit reply + * from the OSD. */ -static void sync_write_commit(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_inode); - dout("sync_write_commit %p tid %llu\n", req, req->r_tid); - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid, + unsafe ? 
"un" : ""); + if (unsafe) { + ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_item, + &ci->i_unsafe_writes); + spin_unlock(&ci->i_unsafe_lock); + } else { + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + } } /* @@ -470,36 +487,33 @@ static void sync_write_commit(struct ceph_osd_request *req, * objects, rollback on failure, etc.) */ static ssize_t ceph_sync_write(struct file *file, const char __user *data, - size_t left, loff_t *offset) + size_t left, loff_t pos, loff_t *ppos) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; struct ceph_osd_request *req; + int num_ops = 1; struct page **pages; int num_pages; - long long unsigned pos; u64 len; int written = 0; int flags; - int do_sync = 0; int check_caps = 0; int page_align, io_align; unsigned long buf_align; int ret; struct timespec mtime = CURRENT_TIME; + bool own_pages = false; if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u %s\n", file, *offset, + dout("sync_write on file %p %lld~%u %s\n", file, pos, (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - if (file->f_flags & O_APPEND) - pos = i_size_read(inode); - else - pos = *offset; - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); if (ret < 0) return ret; @@ -516,7 +530,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) flags |= CEPH_OSD_FLAG_ACK; else - do_sync = 1; + num_ops++; /* Also include a 'startsync' command. 
*/ /* * we may need to do multiple writes here if we span an object @@ -526,25 +540,20 @@ more: io_align = pos & ~PAGE_MASK; buf_align = (unsigned long)data & ~PAGE_MASK; len = left; - if (file->f_flags & O_DIRECT) { - /* write from beginning of first page, regardless of - io alignment */ - page_align = (pos - io_align + buf_align) & ~PAGE_MASK; - num_pages = calc_pages_for((unsigned long)data, len); - } else { - page_align = pos & ~PAGE_MASK; - num_pages = calc_pages_for(pos, len); - } + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - ceph_vino(inode), pos, &len, - CEPH_OSD_OP_WRITE, flags, - ci->i_snap_realm->cached_context, - do_sync, + vino, pos, &len, num_ops, + CEPH_OSD_OP_WRITE, flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, - &mtime, false, page_align); + false); if (IS_ERR(req)) return PTR_ERR(req); + /* write from beginning of first page, regardless of io alignment */ + page_align = file->f_flags & O_DIRECT ? buf_align : io_align; + num_pages = calc_pages_for(page_align, len); if (file->f_flags & O_DIRECT) { pages = ceph_get_direct_page_vector(data, num_pages, false); if (IS_ERR(pages)) { @@ -572,36 +581,20 @@ more: if ((file->f_flags & O_SYNC) == 0) { /* get a second commit callback */ - req->r_safe_callback = sync_write_commit; - req->r_own_pages = 1; + req->r_unsafe_callback = ceph_sync_write_unsafe; + req->r_inode = inode; + own_pages = true; } } - req->r_pages = pages; - req->r_num_pages = num_pages; - req->r_inode = inode; + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, own_pages); + + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) { - if (req->r_safe_callback) { - /* - * Add to inode unsafe list only after we - * start_request so that a tid has been assigned. 
- */ - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_item, - &ci->i_unsafe_writes); - spin_unlock(&ci->i_unsafe_lock); - ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); - } - + if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); - if (ret < 0 && req->r_safe_callback) { - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); - } - } if (file->f_flags & O_DIRECT) ceph_put_page_vector(pages, num_pages, false); @@ -614,12 +607,12 @@ out: pos += len; written += len; left -= len; - data += written; + data += len; if (left) goto more; ret = written; - *offset = pos; + *ppos = pos; if (pos > i_size_read(inode)) check_caps = ceph_inode_set_size(inode, pos); if (check_caps) @@ -653,7 +646,6 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), pos, (unsigned)len, inode); again: - __ceph_do_pending_vmtruncate(inode); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else @@ -717,55 +709,75 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; - loff_t endoff = pos + iov->iov_len; - int got = 0; - int ret, err, written; + ssize_t count, written = 0; + int err, want, got; + bool hold_mutex; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; -retry_snap: - written = 0; - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) - return -ENOSPC; - __ceph_do_pending_vmtruncate(inode); + sb_start_write(inode->i_sb); + mutex_lock(&inode->i_mutex); + hold_mutex = true; - /* - * try to do a buffered write. if we don't have sufficient - * caps, we'll get -EAGAIN from generic_file_aio_write, or a - * short write if we only get caps for some pages. 
- */ - if (!(iocb->ki_filp->f_flags & O_DIRECT) && - !(inode->i_sb->s_flags & MS_SYNCHRONOUS) && - !(fi->flags & CEPH_F_SYNC)) { - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - if (ret >= 0) - written = ret; - - if ((ret >= 0 || ret == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) - || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, pos, pos + written - 1, 1); - if (err < 0) - ret = err; - } - if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff) - goto out; + err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + if (err) + goto out; + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = file->f_mapping->backing_dev_info; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + err = file_remove_suid(file); + if (err) + goto out; + + err = file_update_time(file); + if (err) + goto out; + +retry_snap: + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + err = -ENOSPC; + goto out; } - dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos + written, - (unsigned)iov->iov_len - written, inode->i_size); - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff); - if (ret < 0) + dout("aio_write %p %llx.%llx %llu~%zd getting caps. 
i_size %llu\n", + inode, ceph_vinop(inode), pos, count, inode->i_size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + got = 0; + err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count); + if (err < 0) goto out; - dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos + written, - (unsigned)iov->iov_len - written, ceph_cap_string(got)); - ret = ceph_sync_write(file, iov->iov_base + written, - iov->iov_len - written, &iocb->ki_pos); - if (ret >= 0) { + dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", + inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); + + if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || + (iocb->ki_filp->f_flags & O_DIRECT) || + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) { + mutex_unlock(&inode->i_mutex); + written = ceph_sync_write(file, iov->iov_base, count, + pos, &iocb->ki_pos); + } else { + written = generic_file_buffered_write(iocb, iov, nr_segs, + pos, &iocb->ki_pos, + count, 0); + mutex_unlock(&inode->i_mutex); + } + hold_mutex = false; + + if (written >= 0) { int dirty; spin_lock(&ci->i_ceph_lock); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); @@ -773,18 +785,34 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); } + dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos + written, - (unsigned)iov->iov_len - written, ceph_cap_string(got)); + inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + ceph_cap_string(got)); ceph_put_cap_refs(ci, got); -out: - if (ret == -EOLDSNAPC) { + + if (written >= 0 && + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { + err = vfs_fsync_range(file, pos, pos + written - 1, 1); + if (err < 0) + written = err; + } + + if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u 
got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); + mutex_lock(&inode->i_mutex); + hold_mutex = true; goto retry_snap; } +out: + if (hold_mutex) + mutex_unlock(&inode->i_mutex); + sb_end_write(inode->i_sb); + current->backing_dev_info = NULL; - return ret; + return written ? written : err; } /* @@ -796,7 +824,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) int ret; mutex_lock(&inode->i_mutex); - __ceph_do_pending_vmtruncate(inode); + __ceph_do_pending_vmtruncate(inode, false); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 851814d951cd..be0f7e20d62e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -302,7 +302,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; - ci->i_release_count = 0; + atomic_set(&ci->i_release_count, 1); + atomic_set(&ci->i_complete_count, 0); ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); @@ -561,7 +562,6 @@ static int fill_inode(struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); int i; int issued = 0, implemented; - int updating_inode = 0; struct timespec mtime, atime, ctime; u32 nsplits; struct ceph_buffer *xattr_blob = NULL; @@ -601,7 +601,6 @@ static int fill_inode(struct inode *inode, (ci->i_version & ~1) >= le64_to_cpu(info->version)) goto no_change; - updating_inode = 1; issued = __ceph_caps_issued(ci, &implemented); issued |= implemented | __ceph_caps_dirty(ci); @@ -717,6 +716,17 @@ static int fill_inode(struct inode *inode, ceph_vinop(inode), inode->i_mode); } + /* set dir completion flag? 
*/ + if (S_ISDIR(inode->i_mode) && + ci->i_files == 0 && ci->i_subdirs == 0 && + ceph_snap(inode) == CEPH_NOSNAP && + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + !__ceph_dir_is_complete(ci)) { + dout(" marking %p complete (empty)\n", inode); + __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); + ci->i_max_offset = 2; + } no_change: spin_unlock(&ci->i_ceph_lock); @@ -767,19 +777,6 @@ no_change: __ceph_get_fmode(ci, cap_fmode); } - /* set dir completion flag? */ - if (S_ISDIR(inode->i_mode) && - updating_inode && /* didn't jump to no_change */ - ci->i_files == 0 && ci->i_subdirs == 0 && - ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && - (issued & CEPH_CAP_FILE_EXCL) == 0 && - !ceph_dir_test_complete(inode)) { - dout(" marking %p complete (empty)\n", inode); - ceph_dir_set_complete(inode); - ci->i_max_offset = 2; - } - /* update delegation info? */ if (dirinfo) ceph_fill_dirfrag(inode, dirinfo); @@ -861,7 +858,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) di = ceph_dentry(dn); spin_lock(&ci->i_ceph_lock); - if (!ceph_dir_test_complete(inode)) { + if (!__ceph_dir_is_complete(ci)) { spin_unlock(&ci->i_ceph_lock); return; } @@ -1065,8 +1062,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, /* * d_move() puts the renamed dentry at the end of * d_subdirs. We need to assign it an appropriate - * directory offset so we can behave when holding - * D_COMPLETE. + * directory offset so we can behave when dir is + * complete. */ ceph_set_dentry_offset(req->r_old_dentry); dout("dn %p gets new offset %lld\n", req->r_old_dentry, @@ -1457,7 +1454,7 @@ out: /* - * called by trunc_wq; take i_mutex ourselves + * called by trunc_wq; * * We also truncate in a separate thread as well. 
*/ @@ -1468,9 +1465,7 @@ static void ceph_vmtruncate_work(struct work_struct *work) struct inode *inode = &ci->vfs_inode; dout("vmtruncate_work %p\n", inode); - mutex_lock(&inode->i_mutex); - __ceph_do_pending_vmtruncate(inode); - mutex_unlock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode, true); iput(inode); } @@ -1494,12 +1489,10 @@ void ceph_queue_vmtruncate(struct inode *inode) } /* - * called with i_mutex held. - * * Make sure any pending truncation is applied before doing anything * that may depend on it. */ -void __ceph_do_pending_vmtruncate(struct inode *inode) +void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock) { struct ceph_inode_info *ci = ceph_inode(inode); u64 to; @@ -1532,7 +1525,11 @@ retry: ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); + if (needlock) + mutex_lock(&inode->i_mutex); truncate_inode_pages(inode->i_mapping, to); + if (needlock) + mutex_unlock(&inode->i_mutex); spin_lock(&ci->i_ceph_lock); if (to == ci->i_truncate_size) { @@ -1563,6 +1560,12 @@ static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd) static const struct inode_operations ceph_symlink_iops = { .readlink = generic_readlink, .follow_link = ceph_sym_follow_link, + .setattr = ceph_setattr, + .getattr = ceph_getattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, }; /* @@ -1585,7 +1588,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - __ceph_do_pending_vmtruncate(inode); + __ceph_do_pending_vmtruncate(inode, false); err = inode_change_ok(inode, attr); if (err != 0) @@ -1767,7 +1770,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - __ceph_do_pending_vmtruncate(inode); + __ceph_do_pending_vmtruncate(inode, false); return err; out: spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/ioctl.c 
b/fs/ceph/ioctl.c index 4a989345b37b..e0b4ef31d3c8 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -208,8 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout, - osdc->osdmap); + + ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, + ceph_file_layout_pg_pool(ci->i_layout)); dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); if (dl.osd >= 0) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 442880d099c9..4f22671a5bd4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -265,7 +265,8 @@ static int parse_reply_info_extra(void **p, void *end, { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); - else if (info->head->op == CEPH_MDS_OP_READDIR) + else if (info->head->op == CEPH_MDS_OP_READDIR || + info->head->op == CEPH_MDS_OP_LSSNAP) return parse_reply_info_dir(p, end, info, features); else if (info->head->op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features); @@ -364,9 +365,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s) atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); if (atomic_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) - s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( - s->s_mdsc->fsc->client->monc.auth, - s->s_auth.authorizer); + ceph_auth_destroy_authorizer( + s->s_mdsc->fsc->client->monc.auth, + s->s_auth.authorizer); kfree(s); } } @@ -1196,6 +1197,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) session->s_trim_caps--; if (oissued) { /* we aren't the only cap.. 
just remove us */ + __queue_cap_release(session, ceph_ino(inode), cap->cap_id, + cap->mseq, cap->issue_seq); __ceph_remove_cap(cap); } else { /* try to drop referring dentries */ @@ -1718,8 +1721,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - msg->pages = req->r_pages; - msg->nr_pages = req->r_num_pages; + if (req->r_data_len) { + /* outbound data set only by ceph_sync_setxattr() */ + BUG_ON(!req->r_pages); + ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0); + } + msg->hdr.data_len = cpu_to_le32(req->r_data_len); msg->hdr.data_off = cpu_to_le16(0); @@ -1913,6 +1920,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc, req = list_entry(tmp_list.next, struct ceph_mds_request, r_wait); list_del_init(&req->r_wait); + dout(" wake request %p tid %llu\n", req, req->r_tid); __do_request(mdsc, req); } } @@ -2026,20 +2034,16 @@ out: } /* - * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS + * Invalidate dir's completeness, dentry lease state on an aborted MDS * namespace request. 
*/ void ceph_invalidate_dir_request(struct ceph_mds_request *req) { struct inode *inode = req->r_locked_dir; - struct ceph_inode_info *ci = ceph_inode(inode); - dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); - spin_lock(&ci->i_ceph_lock); - ceph_dir_clear_complete(inode); - ci->i_release_count++; - spin_unlock(&ci->i_ceph_lock); + dout("invalidate_dir_request %p (complete, lease(s))\n", inode); + ceph_dir_clear_complete(inode); if (req->r_dentry) ceph_invalidate_dentry_lease(req->r_dentry); if (req->r_old_dentry) @@ -2599,11 +2603,13 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail; } - reply->pagelist = pagelist; if (recon_state.flock) reply->hdr.version = cpu_to_le16(2); - reply->hdr.data_len = cpu_to_le32(pagelist->length); - reply->nr_pages = calc_pages_for(0, pagelist->length); + if (pagelist->length) { + /* set up outbound data if we have any */ + reply->hdr.data_len = cpu_to_le32(pagelist->length); + ceph_msg_data_add_pagelist(reply, pagelist); + } ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); @@ -3433,13 +3439,17 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_auth_handshake *auth = &s->s_auth; if (force_new && auth->authorizer) { - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, auth->authorizer); + ceph_auth_destroy_authorizer(ac, auth->authorizer); auth->authorizer = NULL; } - if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { - int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, - auth); + if (!auth->authorizer) { + int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); + if (ret) + return ERR_PTR(ret); + } else { + int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); if (ret) return ERR_PTR(ret); } @@ -3455,7 +3465,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) struct ceph_mds_client *mdsc = s->s_mdsc; struct 
ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - return ac->ops->verify_authorizer_reply(ac, s->s_auth.authorizer, len); + return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len); } static int invalidate_authorizer(struct ceph_connection *con) @@ -3464,12 +3474,32 @@ static int invalidate_authorizer(struct ceph_connection *con) struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - if (ac->ops->invalidate_authorizer) - ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); + ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } +static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) +{ + struct ceph_msg *msg; + int type = (int) le16_to_cpu(hdr->type); + int front_len = (int) le32_to_cpu(hdr->front_len); + + if (con->in_msg) + return con->in_msg; + + *skip = 0; + msg = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!msg) { + pr_err("unable to allocate msg type %d len %d\n", + type, front_len); + return NULL; + } + + return msg; +} + static const struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, @@ -3478,6 +3508,7 @@ static const struct ceph_connection_operations mds_con_ops = { .verify_authorizer_reply = verify_authorizer_reply, .invalidate_authorizer = invalidate_authorizer, .peer_reset = peer_reset, + .alloc_msg = mds_alloc_msg, }; /* eof */ diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 0d3c9240c61b..9278dec9e940 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -20,7 +20,10 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) { int n = 0; int i; - char r; + + /* special case for one mds */ + if (1 == m->m_max_mds && m->m_info[0].state > 0) + return 0; /* count */ for (i = 0; i < m->m_max_mds; i++) @@ -30,8 +33,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) return -1; /* pick */ - get_random_bytes(&r, 1); 
- n = r % n; + n = prandom_u32() % n; i = 0; for (i = 0; n > 0; i++, n--) while (m->m_info[i].state <= 0) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index cbb2f54a3019..f01645a27752 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -332,10 +332,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) err = -ENOMEM; if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) goto fail; - snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); + snapc = ceph_create_snap_context(num, GFP_NOFS); if (!snapc) goto fail; - atomic_set(&snapc->nref, 1); /* build (reverse sorted) snap vector */ num = 0; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9fe17c6c2876..7d377c9a5e35 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -479,6 +479,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH; const unsigned required_features = 0; + int page_count; + size_t size; int err = -ENOMEM; fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); @@ -522,8 +524,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, /* set up mempools */ err = -ENOMEM; - fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, - fsc->mount_options->wsize >> PAGE_CACHE_SHIFT); + page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT; + size = sizeof (struct page *) * (page_count ? 
page_count : 1); + fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size); if (!fsc->wb_pagevec_pool) goto fail_trunc_wq; @@ -952,6 +955,7 @@ static struct file_system_type ceph_fs_type = { .kill_sb = ceph_kill_sb, .fs_flags = FS_RENAME_DOES_D_MOVE, }; +MODULE_ALIAS_FS("ceph"); #define _STRINGIFY(x) #x #define STRINGIFY(x) _STRINGIFY(x) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c7b309723dcc..8696be2ff679 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -204,7 +204,6 @@ struct ceph_inode_xattr { * Ceph dentry state */ struct ceph_dentry_info { - unsigned long flags; struct ceph_mds_session *lease_session; u32 lease_gen, lease_shared_gen; u32 lease_seq; @@ -215,18 +214,6 @@ struct ceph_dentry_info { u64 offset; }; -/* - * dentry flags - * - * The locking for D_COMPLETE is a bit odd: - * - we can clear it at almost any time (see ceph_d_prune) - * - it is only meaningful if: - * - we hold dir inode i_ceph_lock - * - we hold dir FILE_SHARED caps - * - the dentry D_COMPLETE is set - */ -#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */ - struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. 
we avoid the overhead of parsing @@ -257,7 +244,8 @@ struct ceph_inode_info { u32 i_time_warp_seq; unsigned i_ceph_flags; - unsigned long i_release_count; + atomic_t i_release_count; + atomic_t i_complete_count; struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; @@ -267,7 +255,7 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */ + u64 i_max_offset; /* largest readdir offset, set with complete dir */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; @@ -436,33 +424,31 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ -static inline void ceph_i_clear(struct inode *inode, unsigned mask) +static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, + int release_count) { - struct ceph_inode_info *ci = ceph_inode(inode); - - spin_lock(&ci->i_ceph_lock); - ci->i_ceph_flags &= ~mask; - spin_unlock(&ci->i_ceph_lock); + atomic_set(&ci->i_complete_count, release_count); } -static inline void ceph_i_set(struct inode *inode, unsigned mask) +static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) { - struct ceph_inode_info *ci = ceph_inode(inode); + atomic_inc(&ci->i_release_count); +} - spin_lock(&ci->i_ceph_lock); - ci->i_ceph_flags |= mask; - spin_unlock(&ci->i_ceph_lock); +static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) +{ + return atomic_read(&ci->i_complete_count) == + atomic_read(&ci->i_release_count); } -static inline bool ceph_i_test(struct inode *inode, unsigned mask) +static inline void ceph_dir_clear_complete(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - bool r; + __ceph_dir_clear_complete(ceph_inode(inode)); +} - spin_lock(&ci->i_ceph_lock); - r = (ci->i_ceph_flags & mask) == mask; - 
spin_unlock(&ci->i_ceph_lock); - return r; +static inline bool ceph_dir_is_complete(struct inode *inode) +{ + return __ceph_dir_is_complete(ceph_inode(inode)); } @@ -489,13 +475,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) } /* - * set/clear directory D_COMPLETE flag - */ -void ceph_dir_set_complete(struct inode *inode); -void ceph_dir_clear_complete(struct inode *inode); -bool ceph_dir_test_complete(struct inode *inode); - -/* * caps helpers */ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) @@ -584,7 +563,7 @@ struct ceph_file_info { u64 next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ struct dentry *dentry; /* next dentry (for dcache readdir) */ - unsigned long dir_release_count; + int dir_release_count; /* used for -o dirstat read() on directory thing */ char *dir_info; @@ -713,7 +692,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, extern int ceph_inode_holds_cap(struct inode *inode, int mask); extern int ceph_inode_set_size(struct inode *inode, loff_t size); -extern void __ceph_do_pending_vmtruncate(struct inode *inode); +extern void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock); extern void ceph_queue_vmtruncate(struct inode *inode); extern void ceph_queue_invalidate(struct inode *inode); @@ -755,6 +734,8 @@ static inline void ceph_remove_cap(struct ceph_cap *cap) extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); +extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, + u64 cap_id, u32 migrate_seq, u32 issue_seq); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index cfd1ce34e0bc..a3b56544c21b 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -506,11 +506,11 @@ 
decode_negTokenInit(unsigned char *security_blob, int length, /* GSSAPI header */ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding negTokenInit header"); + cifs_dbg(FYI, "Error decoding negTokenInit header\n"); return 0; } else if ((cls != ASN1_APL) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag); + cifs_dbg(FYI, "cls = %d con = %d tag = %d\n", cls, con, tag); return 0; } @@ -531,52 +531,52 @@ decode_negTokenInit(unsigned char *security_blob, int length, /* SPNEGO OID not present or garbled -- bail out */ if (!rc) { - cFYI(1, "Error decoding negTokenInit header"); + cifs_dbg(FYI, "Error decoding negTokenInit header\n"); return 0; } /* SPNEGO */ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding negTokenInit"); + cifs_dbg(FYI, "Error decoding negTokenInit\n"); return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", + cls, con, tag, end, *end); return 0; } /* negTokenInit */ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding negTokenInit"); + cifs_dbg(FYI, "Error decoding negTokenInit\n"); return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", + cls, con, tag, end, *end); return 0; } /* sequence */ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding 2nd part of negTokenInit"); + cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n"); return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0", - cls, con, tag, end, 
*end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", + cls, con, tag, end, *end); return 0; } /* sequence of */ if (asn1_header_decode (&ctx, &sequence_end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding 2nd part of negTokenInit"); + cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n"); return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", + cls, con, tag, end, *end); return 0; } @@ -584,15 +584,15 @@ decode_negTokenInit(unsigned char *security_blob, int length, while (!asn1_eoc_decode(&ctx, sequence_end)) { rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); if (!rc) { - cFYI(1, "Error decoding negTokenInit hdr exit2"); + cifs_dbg(FYI, "Error decoding negTokenInit hdr exit2\n"); return 0; } if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { - cFYI(1, "OID len = %d oid = 0x%lx 0x%lx " - "0x%lx 0x%lx", oidlen, *oid, - *(oid + 1), *(oid + 2), *(oid + 3)); + cifs_dbg(FYI, "OID len = %d oid = 0x%lx 0x%lx 0x%lx 0x%lx\n", + oidlen, *oid, *(oid + 1), *(oid + 2), + *(oid + 3)); if (compare_oid(oid, oidlen, MSKRB5_OID, MSKRB5_OID_LEN)) @@ -610,57 +610,14 @@ decode_negTokenInit(unsigned char *security_blob, int length, kfree(oid); } } else { - cFYI(1, "Should be an oid what is going on?"); + cifs_dbg(FYI, "Should be an oid what is going on?\n"); } } - /* mechlistMIC */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - /* Check if we have reached the end of the blob, but with - no mechListMic (e.g. 
NTLMSSP instead of KRB5) */ - if (ctx.error == ASN1_ERR_DEC_EMPTY) - goto decode_negtoken_exit; - cFYI(1, "Error decoding last part negTokenInit exit3"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - /* tag = 3 indicating mechListMIC */ - cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - - /* sequence */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit5"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_CON) - || (tag != ASN1_SEQ)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - } - - /* sequence of */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit 7"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - - /* general string */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit9"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) - || (tag != ASN1_GENSTR)) { - cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - cFYI(1, "Need to call asn1_octets_decode() function for %s", - ctx.pointer); /* is this UTF-8 or ASCII? */ -decode_negtoken_exit: + /* + * We currently ignore anything at the end of the SPNEGO blob after + * the mechTypes have been parsed, since none of that info is + * used at the moment. 
+ */ return 1; } diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 282d6de7e410..6c665bf4a27c 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -92,7 +92,7 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data, break; default: - cERROR(1, "Unknown network family '%d'", sa->sa_family); + cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family); key_len = 0; break; } @@ -152,7 +152,7 @@ static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer, sharename = extract_sharename(tcon->treeName); if (IS_ERR(sharename)) { - cFYI(1, "%s: couldn't extract sharename", __func__); + cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); sharename = NULL; return 0; } @@ -302,7 +302,7 @@ static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) pagevec_init(&pvec, 0); first = 0; - cFYI(1, "%s: cifs inode 0x%p now uncached", __func__, cifsi); + cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi); for (;;) { nr_pages = pagevec_lookup(&pvec, diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index d9ea6ede6a7a..d59748346020 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -57,15 +57,32 @@ cifs_dump_mem(char *label, void *data, int length) } } +#ifdef CONFIG_CIFS_DEBUG +void cifs_vfs_err(const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR "CIFS VFS: %pV", &vaf); + + va_end(args); +} +#endif + void cifs_dump_detail(void *buf) { #ifdef CONFIG_CIFS_DEBUG2 struct smb_hdr *smb = (struct smb_hdr *)buf; - cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d", - smb->Command, smb->Status.CifsError, - smb->Flags, smb->Flags2, smb->Mid, smb->Pid); - cERROR(1, "smb buf %p len %u", smb, smbCalcSize(smb)); + cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n", + smb->Command, smb->Status.CifsError, + smb->Flags, smb->Flags2, smb->Mid, smb->Pid); + cifs_dbg(VFS, "smb buf %p len %u\n", smb, smbCalcSize(smb)); #endif /* CONFIG_CIFS_DEBUG2 */ } @@ -78,25 +95,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server) if (server == NULL) return; - cERROR(1, "Dump pending requests:"); + cifs_dbg(VFS, "Dump pending requests:\n"); spin_lock(&GlobalMid_Lock); list_for_each(tmp, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu", - mid_entry->mid_state, - le16_to_cpu(mid_entry->command), - mid_entry->pid, - mid_entry->callback_data, - mid_entry->mid); + cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n", + mid_entry->mid_state, + le16_to_cpu(mid_entry->command), + mid_entry->pid, + mid_entry->callback_data, + mid_entry->mid); #ifdef CONFIG_CIFS_STATS2 - cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld", - mid_entry->large_buf, - mid_entry->resp_buf, - mid_entry->when_received, - jiffies); + cifs_dbg(VFS, "IsLarge: %d buf: %p time rcv: %ld now: %ld\n", + mid_entry->large_buf, + mid_entry->resp_buf, + mid_entry->when_received, + jiffies); #endif /* STATS2 */ - cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp, - mid_entry->multiEnd); + cifs_dbg(VFS, "IsMult: %d IsEnd: %d\n", + mid_entry->multiRsp, mid_entry->multiEnd); if (mid_entry->resp_buf) { 
cifs_dump_detail(mid_entry->resp_buf); cifs_dump_mem("existing buf: ", @@ -603,7 +620,7 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, global_secflags = CIFSSEC_MAX; return count; } else if (!isdigit(c)) { - cERROR(1, "invalid flag %c", c); + cifs_dbg(VFS, "invalid flag %c\n", c); return -EINVAL; } } @@ -611,16 +628,16 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, flags = simple_strtoul(flags_string, NULL, 0); - cFYI(1, "sec flags 0x%x", flags); + cifs_dbg(FYI, "sec flags 0x%x\n", flags); if (flags <= 0) { - cERROR(1, "invalid security flags %s", flags_string); + cifs_dbg(VFS, "invalid security flags %s\n", flags_string); return -EINVAL; } if (flags & ~CIFSSEC_MASK) { - cERROR(1, "attempt to set unsupported security flags 0x%x", - flags & ~CIFSSEC_MASK); + cifs_dbg(VFS, "attempt to set unsupported security flags 0x%x\n", + flags & ~CIFSSEC_MASK); return -EINVAL; } /* flags look ok - update the global security flags for cifs module */ @@ -628,9 +645,9 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, if (global_secflags & CIFSSEC_MUST_SIGN) { /* requiring signing implies signing is allowed */ global_secflags |= CIFSSEC_MAY_SIGN; - cFYI(1, "packet signing now required"); + cifs_dbg(FYI, "packet signing now required\n"); } else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) { - cFYI(1, "packet signing disabled"); + cifs_dbg(FYI, "packet signing disabled\n"); } /* BB should we turn on MAY flags for other MUST options? 
*/ return count; diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 69ae3d3c3b31..c99b40fb609b 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -25,18 +25,20 @@ void cifs_dump_mem(char *label, void *data, int length); void cifs_dump_detail(void *); void cifs_dump_mids(struct TCP_Server_Info *); -#ifdef CONFIG_CIFS_DEBUG2 -#define DBG2 2 -#else -#define DBG2 0 -#endif extern int traceSMB; /* flag which enables the function below */ void dump_smb(void *, int); #define CIFS_INFO 0x01 #define CIFS_RC 0x02 #define CIFS_TIMER 0x04 +#define VFS 1 +#define FYI 2 extern int cifsFYI; +#ifdef CONFIG_CIFS_DEBUG2 +#define NOISY 4 +#else +#define NOISY 0 +#endif /* * debug ON @@ -44,31 +46,21 @@ extern int cifsFYI; */ #ifdef CONFIG_CIFS_DEBUG -/* information message: e.g., configuration, major event */ -#define cifsfyi(fmt, ...) \ -do { \ - if (cifsFYI & CIFS_INFO) \ - printk(KERN_DEBUG "%s: " fmt "\n", \ - __FILE__, ##__VA_ARGS__); \ -} while (0) - -#define cFYI(set, fmt, ...) \ -do { \ - if (set) \ - cifsfyi(fmt, ##__VA_ARGS__); \ -} while (0) +__printf(1, 2) void cifs_vfs_err(const char *fmt, ...); -#define cifswarn(fmt, ...) \ - printk(KERN_WARNING fmt "\n", ##__VA_ARGS__) - -/* error event message: e.g., i/o error */ -#define cifserror(fmt, ...) \ - printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \ - -#define cERROR(set, fmt, ...) \ +/* information message: e.g., configuration, major event */ +#define cifs_dbg(type, fmt, ...) \ do { \ - if (set) \ - cifserror(fmt, ##__VA_ARGS__); \ + if (type == FYI) { \ + if (cifsFYI & CIFS_INFO) { \ + printk(KERN_DEBUG "%s: " fmt, \ + __FILE__, ##__VA_ARGS__); \ + } \ + } else if (type == VFS) { \ + cifs_vfs_err(fmt, ##__VA_ARGS__); \ + } else if (type == NOISY && type != 0) { \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ + } \ } while (0) /* @@ -76,27 +68,11 @@ do { \ * --------- */ #else /* _CIFS_DEBUG */ -#define cifsfyi(fmt, ...) \ +#define cifs_dbg(type, fmt, ...) 
\ do { \ if (0) \ - printk(KERN_DEBUG "%s: " fmt "\n", \ - __FILE__, ##__VA_ARGS__); \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ } while (0) -#define cFYI(set, fmt, ...) \ -do { \ - if (0 && set) \ - cifsfyi(fmt, ##__VA_ARGS__); \ -} while (0) -#define cifserror(fmt, ...) \ -do { \ - if (0) \ - printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \ -} while (0) -#define cERROR(set, fmt, ...) \ -do { \ - if (0 && set) \ - cifserror(fmt, ##__VA_ARGS__); \ -} while (0) -#endif /* _CIFS_DEBUG */ +#endif #endif /* _H_CIFS_DEBUG */ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 210fce2df308..8e33ec65847b 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -84,8 +84,8 @@ static char *cifs_get_share_name(const char *node_name) /* find server name end */ pSep = memchr(UNC+2, '\\', len-2); if (!pSep) { - cERROR(1, "%s: no server name end in node name: %s", - __func__, node_name); + cifs_dbg(VFS, "%s: no server name end in node name: %s\n", + __func__, node_name); kfree(UNC); return ERR_PTR(-EINVAL); } @@ -141,8 +141,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata, rc = dns_resolve_server_name_to_ip(*devname, &srvIP); if (rc < 0) { - cFYI(1, "%s: Failed to resolve server part of %s to IP: %d", - __func__, *devname, rc); + cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n", + __func__, *devname, rc); goto compose_mount_options_err; } @@ -216,8 +216,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata, strcat(mountdata, fullpath + ref->path_consumed); } - /*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/ - /*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/ + /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/ + /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/ compose_mount_options_out: kfree(srvIP); @@ -260,11 +260,12 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, static void dump_referral(const 
struct dfs_info3_param *ref) { - cFYI(1, "DFS: ref path: %s", ref->path_name); - cFYI(1, "DFS: node path: %s", ref->node_name); - cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type); - cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag, - ref->path_consumed); + cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name); + cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name); + cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n", + ref->flags, ref->server_type); + cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n", + ref->ref_flag, ref->path_consumed); } /* @@ -283,7 +284,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) struct vfsmount *mnt; struct tcon_link *tlink; - cFYI(1, "in %s", __func__); + cifs_dbg(FYI, "in %s\n", __func__); BUG_ON(IS_ROOT(mntpt)); /* @@ -320,15 +321,15 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) /* connect to a node */ len = strlen(referrals[i].node_name); if (len < 2) { - cERROR(1, "%s: Net Address path too short: %s", - __func__, referrals[i].node_name); + cifs_dbg(VFS, "%s: Net Address path too short: %s\n", + __func__, referrals[i].node_name); mnt = ERR_PTR(-EINVAL); break; } mnt = cifs_dfs_do_refmount(cifs_sb, full_path, referrals + i); - cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, - referrals[i].node_name, mnt); + cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", + __func__, referrals[i].node_name, mnt); if (!IS_ERR(mnt)) goto success; } @@ -343,7 +344,7 @@ success: free_full_path: kfree(full_path); cdda_exit: - cFYI(1, "leaving %s" , __func__); + cifs_dbg(FYI, "leaving %s\n" , __func__); return mnt; } @@ -354,11 +355,11 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path) { struct vfsmount *newmnt; - cFYI(1, "in %s", __func__); + cifs_dbg(FYI, "in %s\n", __func__); newmnt = cifs_dfs_do_automount(path->dentry); if (IS_ERR(newmnt)) { - cFYI(1, "leaving %s [automount failed]" , __func__); + cifs_dbg(FYI, "leaving %s [automount 
failed]\n" , __func__); return newmnt; } @@ -366,7 +367,7 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path) mnt_set_expiry(newmnt, &cifs_dfs_automount_list); schedule_delayed_work(&cifs_dfs_automount_task, cifs_dfs_mountpoint_expiry_timeout); - cFYI(1, "leaving %s [ok]" , __func__); + cifs_dbg(FYI, "leaving %s [ok]\n" , __func__); return newmnt; } diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 10e774761299..a3e932547617 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -37,12 +37,11 @@ cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep) int ret; ret = -ENOMEM; - payload = kmalloc(prep->datalen, GFP_KERNEL); + payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); if (!payload) goto error; /* attach the data */ - memcpy(payload, prep->data, prep->datalen); key->payload.data = payload; ret = 0; @@ -164,7 +163,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) dp = description + strlen(description); sprintf(dp, ";pid=0x%x", current->pid); - cFYI(1, "key description = %s", description); + cifs_dbg(FYI, "key description = %s\n", description); spnego_key = request_key(&cifs_spnego_key_type, description, ""); #ifdef CONFIG_CIFS_DEBUG2 diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 71d5d0a5f6b2..0227b45ef00a 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -227,8 +227,8 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len, for (i = 0; len && *from; i++, from += charlen, len -= charlen) { charlen = codepage->char2uni(from, len, &wchar_to); if (charlen < 1) { - cERROR(1, "strtoUTF16: char2uni of 0x%x returned %d", - *from, charlen); + cifs_dbg(VFS, "strtoUTF16: char2uni of 0x%x returned %d\n", + *from, charlen); /* A question mark */ wchar_to = 0x003f; charlen = 1; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index f1e3f25fe004..51f5e0ee7237 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -63,11 +63,10 @@ cifs_idmap_key_instantiate(struct key 
*key, struct key_preparsed_payload *prep) key->datalen = prep->datalen; return 0; } - payload = kmalloc(prep->datalen, GFP_KERNEL); + payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); if (!payload) return -ENOMEM; - memcpy(payload, prep->data, prep->datalen); key->payload.data = payload; key->datalen = prep->datalen; return 0; @@ -219,13 +218,13 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) sidkey = request_key(&cifs_idmap_key_type, desc, ""); if (IS_ERR(sidkey)) { rc = -EINVAL; - cFYI(1, "%s: Can't map %cid %u to a SID", __func__, - sidtype == SIDOWNER ? 'u' : 'g', cid); + cifs_dbg(FYI, "%s: Can't map %cid %u to a SID\n", + __func__, sidtype == SIDOWNER ? 'u' : 'g', cid); goto out_revert_creds; } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) { rc = -EIO; - cFYI(1, "%s: Downcall contained malformed key " - "(datalen=%hu)", __func__, sidkey->datalen); + cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n", + __func__, sidkey->datalen); goto invalidate_key; } @@ -241,8 +240,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32)); if (ksid_size > sidkey->datalen) { rc = -EIO; - cFYI(1, "%s: Downcall contained malformed key (datalen=%hu, " - "ksid_size=%u)", __func__, sidkey->datalen, ksid_size); + cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu, ksid_size=%u)\n", + __func__, sidkey->datalen, ksid_size); goto invalidate_key; } @@ -274,8 +273,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, * Just return an error. 
*/ if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) { - cFYI(1, "%s: %u subauthorities is too many!", __func__, - psid->num_subauth); + cifs_dbg(FYI, "%s: %u subauthorities is too many!\n", + __func__, psid->num_subauth); return -EIO; } @@ -287,8 +286,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); if (IS_ERR(sidkey)) { rc = -EINVAL; - cFYI(1, "%s: Can't map SID %s to a %cid", __func__, sidstr, - sidtype == SIDOWNER ? 'u' : 'g'); + cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n", + __func__, sidstr, sidtype == SIDOWNER ? 'u' : 'g'); goto out_revert_creds; } @@ -300,8 +299,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); if (sidkey->datalen != sizeof(uid_t)) { rc = -EIO; - cFYI(1, "%s: Downcall contained malformed key " - "(datalen=%hu)", __func__, sidkey->datalen); + cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n", + __func__, sidkey->datalen); key_invalidate(sidkey); goto out_key_put; } @@ -346,7 +345,8 @@ init_cifs_idmap(void) struct key *keyring; int ret; - cFYI(1, "Registering the %s key type", cifs_idmap_key_type.name); + cifs_dbg(FYI, "Registering the %s key type\n", + cifs_idmap_key_type.name); /* create an override credential set with a special thread keyring in * which requests are cached @@ -379,7 +379,7 @@ init_cifs_idmap(void) cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; root_cred = cred; - cFYI(1, "cifs idmap keyring: %d", key_serial(keyring)); + cifs_dbg(FYI, "cifs idmap keyring: %d\n", key_serial(keyring)); return 0; failed_put_key: @@ -395,7 +395,7 @@ exit_cifs_idmap(void) key_revoke(root_cred->thread_keyring); unregister_key_type(&cifs_idmap_key_type); put_cred(root_cred); - cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name); + cifs_dbg(FYI, "Unregistered %s key type\n", cifs_idmap_key_type.name); } /* copy ntsd, owner sid, and group sid from a security 
descriptor to another */ @@ -462,14 +462,14 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode, *pbits_to_set &= ~S_IXUGO; return; } else if (type != ACCESS_ALLOWED) { - cERROR(1, "unknown access control type %d", type); + cifs_dbg(VFS, "unknown access control type %d\n", type); return; } /* else ACCESS_ALLOWED type */ if (flags & GENERIC_ALL) { *pmode |= (S_IRWXUGO & (*pbits_to_set)); - cFYI(DBG2, "all perms"); + cifs_dbg(NOISY, "all perms\n"); return; } if ((flags & GENERIC_WRITE) || @@ -482,7 +482,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode, ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) *pmode |= (S_IXUGO & (*pbits_to_set)); - cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode); + cifs_dbg(NOISY, "access flags 0x%x mode now 0x%x\n", flags, *pmode); return; } @@ -511,7 +511,8 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use, if (mode & S_IXUGO) *pace_flags |= SET_FILE_EXEC_RIGHTS; - cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags); + cifs_dbg(NOISY, "mode: 0x%x, access flags now 0x%x\n", + mode, *pace_flags); return; } @@ -551,24 +552,24 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl) /* validate that we do not go past end of acl */ if (le16_to_cpu(pace->size) < 16) { - cERROR(1, "ACE too small %d", le16_to_cpu(pace->size)); + cifs_dbg(VFS, "ACE too small %d\n", le16_to_cpu(pace->size)); return; } if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) { - cERROR(1, "ACL too small to parse ACE"); + cifs_dbg(VFS, "ACL too small to parse ACE\n"); return; } num_subauth = pace->sid.num_subauth; if (num_subauth) { int i; - cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d", - pace->sid.revision, pace->sid.num_subauth, pace->type, - pace->flags, le16_to_cpu(pace->size)); + cifs_dbg(FYI, "ACE revision %d num_auth %d type %d flags %d size %d\n", + pace->sid.revision, pace->sid.num_subauth, pace->type, + pace->flags, 
le16_to_cpu(pace->size)); for (i = 0; i < num_subauth; ++i) { - cFYI(1, "ACE sub_auth[%d]: 0x%x", i, - le32_to_cpu(pace->sid.sub_auth[i])); + cifs_dbg(FYI, "ACE sub_auth[%d]: 0x%x\n", + i, le32_to_cpu(pace->sid.sub_auth[i])); } /* BB add length check to make sure that we do not have huge @@ -601,13 +602,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, /* validate that we do not go past end of acl */ if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { - cERROR(1, "ACL too small to parse DACL"); + cifs_dbg(VFS, "ACL too small to parse DACL\n"); return; } - cFYI(DBG2, "DACL revision %d size %d num aces %d", - le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), - le32_to_cpu(pdacl->num_aces)); + cifs_dbg(NOISY, "DACL revision %d size %d num aces %d\n", + le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), + le32_to_cpu(pdacl->num_aces)); /* reset rwx permissions for user/group/other. Also, if num_aces is 0 i.e. DACL has no ACEs, @@ -627,10 +628,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, return; ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), GFP_KERNEL); - if (!ppace) { - cERROR(1, "DACL memory allocation error"); + if (!ppace) return; - } for (i = 0; i < num_aces; ++i) { ppace[i] = (struct cifs_ace *) (acl_base + acl_size); @@ -703,25 +702,25 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl) /* validate that we do not go past end of ACL - sid must be at least 8 bytes long (assuming no sub-auths - e.g. 
the null SID */ if (end_of_acl < (char *)psid + 8) { - cERROR(1, "ACL too small to parse SID %p", psid); + cifs_dbg(VFS, "ACL too small to parse SID %p\n", psid); return -EINVAL; } #ifdef CONFIG_CIFS_DEBUG2 if (psid->num_subauth) { int i; - cFYI(1, "SID revision %d num_auth %d", - psid->revision, psid->num_subauth); + cifs_dbg(FYI, "SID revision %d num_auth %d\n", + psid->revision, psid->num_subauth); for (i = 0; i < psid->num_subauth; i++) { - cFYI(1, "SID sub_auth[%d]: 0x%x ", i, - le32_to_cpu(psid->sub_auth[i])); + cifs_dbg(FYI, "SID sub_auth[%d]: 0x%x\n", + i, le32_to_cpu(psid->sub_auth[i])); } /* BB add length check to make sure that we do not have huge num auths and therefore go off the end */ - cFYI(1, "RID 0x%x", - le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); + cifs_dbg(FYI, "RID 0x%x\n", + le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); } #endif @@ -748,31 +747,33 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, le32_to_cpu(pntsd->gsidoffset)); dacloffset = le32_to_cpu(pntsd->dacloffset); dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); - cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x " - "sacloffset 0x%x dacloffset 0x%x", + cifs_dbg(NOISY, "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n", pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), le32_to_cpu(pntsd->gsidoffset), le32_to_cpu(pntsd->sacloffset), dacloffset); /* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ rc = parse_sid(owner_sid_ptr, end_of_acl); if (rc) { - cFYI(1, "%s: Error %d parsing Owner SID", __func__, rc); + cifs_dbg(FYI, "%s: Error %d parsing Owner SID\n", __func__, rc); return rc; } rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER); if (rc) { - cFYI(1, "%s: Error %d mapping Owner SID to uid", __func__, rc); + cifs_dbg(FYI, "%s: Error %d mapping Owner SID to uid\n", + __func__, rc); return rc; } rc = parse_sid(group_sid_ptr, end_of_acl); if (rc) { - cFYI(1, "%s: Error %d mapping Owner SID 
to gid", __func__, rc); + cifs_dbg(FYI, "%s: Error %d mapping Owner SID to gid\n", + __func__, rc); return rc; } rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP); if (rc) { - cFYI(1, "%s: Error %d mapping Group SID to gid", __func__, rc); + cifs_dbg(FYI, "%s: Error %d mapping Group SID to gid\n", + __func__, rc); return rc; } @@ -780,7 +781,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr, fattr); else - cFYI(1, "no ACL"); /* BB grant all or default perms? */ + cifs_dbg(FYI, "no ACL\n"); /* BB grant all or default perms? */ return rc; } @@ -830,8 +831,8 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, id = from_kuid(&init_user_ns, uid); rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr); if (rc) { - cFYI(1, "%s: Mapping error %d for owner id %d", - __func__, rc, id); + cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n", + __func__, rc, id); kfree(nowner_sid_ptr); return rc; } @@ -850,8 +851,8 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, id = from_kgid(&init_user_ns, gid); rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr); if (rc) { - cFYI(1, "%s: Mapping error %d for group id %d", - __func__, rc, id); + cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n", + __func__, rc, id); kfree(ngroup_sid_ptr); return rc; } @@ -881,7 +882,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, cifs_put_tlink(tlink); - cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen); if (rc) return ERR_PTR(rc); return pntsd; @@ -918,7 +919,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, cifs_put_tlink(tlink); free_xid(xid); - cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen); if (rc) return ERR_PTR(rc); return pntsd; @@ -972,12 +973,12 @@ int 
set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, create_options, &fid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { - cERROR(1, "Unable to open file to set ACL"); + cifs_dbg(VFS, "Unable to open file to set ACL\n"); goto out; } rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag); - cFYI(DBG2, "SetCIFSACL rc = %d", rc); + cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc); CIFSSMBClose(xid, tcon, fid); out: @@ -995,7 +996,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, u32 acllen = 0; int rc = 0; - cFYI(DBG2, "converting ACL to mode for %s", path); + cifs_dbg(NOISY, "converting ACL to mode for %s\n", path); if (pfid) pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); @@ -1005,12 +1006,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); - cERROR(1, "%s: error %d getting sec desc", __func__, rc); + cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); } else { rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr); kfree(pntsd); if (rc) - cERROR(1, "parse sec desc failed rc = %d", rc); + cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc); } return rc; @@ -1027,13 +1028,13 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ - cFYI(DBG2, "set ACL from mode for %s", path); + cifs_dbg(NOISY, "set ACL from mode for %s\n", path); /* Get the security descriptor */ pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); - cERROR(1, "%s: error %d getting sec desc", __func__, rc); + cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); goto out; } @@ -1046,7 +1047,6 @@ id_mode_to_cifs_acl(struct inode *inode, 
const char *path, __u64 nmode, secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN); pnntsd = kmalloc(secdesclen, GFP_KERNEL); if (!pnntsd) { - cERROR(1, "Unable to allocate security descriptor"); kfree(pntsd); return -ENOMEM; } @@ -1054,12 +1054,12 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, &aclflag); - cFYI(DBG2, "build_sec_desc rc: %d", rc); + cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); if (!rc) { /* Set the security descriptor */ rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag); - cFYI(DBG2, "set_cifs_acl rc: %d", rc); + cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } kfree(pnntsd); diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 652f5051be09..71436d1fca13 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -50,20 +50,20 @@ static int cifs_calc_signature(struct smb_rqst *rqst, return -EINVAL; if (!server->secmech.sdescmd5) { - cERROR(1, "%s: Can't generate signature", __func__); + cifs_dbg(VFS, "%s: Can't generate signature\n", __func__); return -1; } rc = crypto_shash_init(&server->secmech.sdescmd5->shash); if (rc) { - cERROR(1, "%s: Could not init md5", __func__); + cifs_dbg(VFS, "%s: Could not init md5\n", __func__); return rc; } rc = crypto_shash_update(&server->secmech.sdescmd5->shash, server->session_key.response, server->session_key.len); if (rc) { - cERROR(1, "%s: Could not update with response", __func__); + cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } @@ -71,7 +71,7 @@ static int cifs_calc_signature(struct smb_rqst *rqst, if (iov[i].iov_len == 0) continue; if (iov[i].iov_base == NULL) { - cERROR(1, "null iovec entry"); + cifs_dbg(VFS, "null iovec entry\n"); return -EIO; } /* The first entry includes a length field (which does not get @@ -88,8 +88,8 @@ static int cifs_calc_signature(struct smb_rqst *rqst, iov[i].iov_base, iov[i].iov_len); } if (rc) { - cERROR(1, "%s: Could 
not update with payload", - __func__); + cifs_dbg(VFS, "%s: Could not update with payload\n", + __func__); return rc; } } @@ -106,7 +106,7 @@ static int cifs_calc_signature(struct smb_rqst *rqst, rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); if (rc) - cERROR(1, "%s: Could not generate md5 hash", __func__); + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -135,8 +135,8 @@ int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, cpu_to_le32(server->sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; - *pexpected_response_sequence_number = server->sequence_number++; - server->sequence_number++; + *pexpected_response_sequence_number = ++server->sequence_number; + ++server->sequence_number; rc = cifs_calc_signature(rqst, server, smb_signature); if (rc) @@ -196,8 +196,8 @@ int cifs_verify_signature(struct smb_rqst *rqst, /* Do not need to verify session setups with signature "BSRSPYL " */ if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0) - cFYI(1, "dummy signature received for smb command 0x%x", - cifs_pdu->Command); + cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n", + cifs_pdu->Command); /* save off the origiginal signature so we can modify the smb and check its signature against what the server sent */ @@ -235,30 +235,30 @@ int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp) return -EINVAL; ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL); - if (!ses->auth_key.response) { - cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len); + if (!ses->auth_key.response) return -ENOMEM; - } + ses->auth_key.len = temp_len; rc = SMBNTencrypt(ses->password, ses->server->cryptkey, ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp); if (rc) { - cFYI(1, "%s Can't generate NTLM response, error: %d", - __func__, rc); + cifs_dbg(FYI, "%s Can't generate NTLM response, error: %d\n", + __func__, rc); return rc; } rc = 
E_md4hash(ses->password, temp_key, nls_cp); if (rc) { - cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); + cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n", + __func__, rc); return rc; } rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); if (rc) - cFYI(1, "%s Can't generate NTLM session key, error: %d", - __func__, rc); + cifs_dbg(FYI, "%s Can't generate NTLM session key, error: %d\n", + __func__, rc); return rc; } @@ -334,7 +334,6 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); if (!ses->auth_key.response) { ses->auth_key.len = 0; - cERROR(1, "Challenge target info allocation failure"); return -ENOMEM; } @@ -420,7 +419,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, wchar_t *server; if (!ses->server->secmech.sdeschmacmd5) { - cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash"); + cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); return -1; } @@ -430,13 +429,13 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, CIFS_NTHASH_SIZE); if (rc) { - cERROR(1, "%s: Could not set NT Hash as a key", __func__); + cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__); return rc; } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { - cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5"); + cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__); return rc; } @@ -444,7 +443,6 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = ses->user_name ? 
strlen(ses->user_name) : 0; user = kmalloc(2 + (len * 2), GFP_KERNEL); if (user == NULL) { - cERROR(1, "calc_ntlmv2_hash: user mem alloc failure"); rc = -ENOMEM; return rc; } @@ -460,7 +458,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, (char *)user, 2 * len); kfree(user); if (rc) { - cERROR(1, "%s: Could not update with user", __func__); + cifs_dbg(VFS, "%s: Could not update with user\n", __func__); return rc; } @@ -470,7 +468,6 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, domain = kmalloc(2 + (len * 2), GFP_KERNEL); if (domain == NULL) { - cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); rc = -ENOMEM; return rc; } @@ -481,8 +478,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, (char *)domain, 2 * len); kfree(domain); if (rc) { - cERROR(1, "%s: Could not update with domain", - __func__); + cifs_dbg(VFS, "%s: Could not update with domain\n", + __func__); return rc; } } else if (ses->serverName) { @@ -490,7 +487,6 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, server = kmalloc(2 + (len * 2), GFP_KERNEL); if (server == NULL) { - cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); rc = -ENOMEM; return rc; } @@ -501,8 +497,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, (char *)server, 2 * len); kfree(server); if (rc) { - cERROR(1, "%s: Could not update with server", - __func__); + cifs_dbg(VFS, "%s: Could not update with server\n", + __func__); return rc; } } @@ -510,7 +506,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ntlmv2_hash); if (rc) - cERROR(1, "%s: Could not generate md5 hash", __func__); + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -522,20 +518,21 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) unsigned int offset = CIFS_SESS_KEY_SIZE + 8; if 
(!ses->server->secmech.sdeschmacmd5) { - cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash"); + cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__); return -1; } rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { - cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__); + cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", + __func__); return rc; } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { - cERROR(1, "CalcNTLMv2_response: could not init hmacmd5"); + cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__); return rc; } @@ -548,14 +545,14 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + offset, ses->auth_key.len - offset); if (rc) { - cERROR(1, "%s: Could not update with response", __func__); + cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + CIFS_SESS_KEY_SIZE); if (rc) - cERROR(1, "%s: Could not generate md5 hash", __func__); + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -575,14 +572,15 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) if (!ses->domainName) { rc = find_domain_name(ses, nls_cp); if (rc) { - cERROR(1, "error %d finding domain name", rc); + cifs_dbg(VFS, "error %d finding domain name\n", + rc); goto setup_ntlmv2_rsp_ret; } } } else { rc = build_avpair_blob(ses, nls_cp); if (rc) { - cERROR(1, "error %d building av pair blob", rc); + cifs_dbg(VFS, "error %d building av pair blob\n", rc); goto setup_ntlmv2_rsp_ret; } } @@ -595,7 +593,6 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) if (!ses->auth_key.response) { rc = ENOMEM; ses->auth_key.len = 0; - cERROR(1, "%s: Can't allocate auth blob", __func__); goto 
setup_ntlmv2_rsp_ret; } ses->auth_key.len += baselen; @@ -613,14 +610,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) /* calculate ntlmv2_hash */ rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); if (rc) { - cERROR(1, "could not get v2 hash rc %d", rc); + cifs_dbg(VFS, "could not get v2 hash rc %d\n", rc); goto setup_ntlmv2_rsp_ret; } /* calculate first part of the client response (CR1) */ rc = CalcNTLMv2_response(ses, ntlmv2_hash); if (rc) { - cERROR(1, "Could not calculate CR1 rc: %d", rc); + cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc); goto setup_ntlmv2_rsp_ret; } @@ -628,13 +625,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { - cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__); + cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", + __func__); goto setup_ntlmv2_rsp_ret; } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { - cERROR(1, "%s: Could not init hmacmd5", __func__); + cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); goto setup_ntlmv2_rsp_ret; } @@ -642,14 +640,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) ses->auth_key.response + CIFS_SESS_KEY_SIZE, CIFS_HMAC_MD5_HASH_SIZE); if (rc) { - cERROR(1, "%s: Could not update with response", __func__); + cifs_dbg(VFS, "%s: Could not update with response\n", __func__); goto setup_ntlmv2_rsp_ret; } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response); if (rc) - cERROR(1, "%s: Could not generate md5 hash", __func__); + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); setup_ntlmv2_rsp_ret: kfree(tiblob); @@ -671,7 +669,7 @@ calc_seckey(struct cifs_ses *ses) tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_arc4)) { rc = PTR_ERR(tfm_arc4); - cERROR(1, "could not allocate crypto API 
arc4"); + cifs_dbg(VFS, "could not allocate crypto API arc4\n"); return rc; } @@ -680,7 +678,8 @@ calc_seckey(struct cifs_ses *ses) rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE); if (rc) { - cERROR(1, "%s: Could not set response as a key", __func__); + cifs_dbg(VFS, "%s: Could not set response as a key\n", + __func__); return rc; } @@ -689,7 +688,7 @@ calc_seckey(struct cifs_ses *ses) rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE); if (rc) { - cERROR(1, "could not encrypt session key rc: %d", rc); + cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc); crypto_free_blkcipher(tfm_arc4); return rc; } @@ -731,20 +730,20 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server) server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); if (IS_ERR(server->secmech.hmacmd5)) { - cERROR(1, "could not allocate crypto hmacmd5"); + cifs_dbg(VFS, "could not allocate crypto hmacmd5\n"); return PTR_ERR(server->secmech.hmacmd5); } server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(server->secmech.md5)) { - cERROR(1, "could not allocate crypto md5"); + cifs_dbg(VFS, "could not allocate crypto md5\n"); rc = PTR_ERR(server->secmech.md5); goto crypto_allocate_md5_fail; } server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0); if (IS_ERR(server->secmech.hmacsha256)) { - cERROR(1, "could not allocate crypto hmacsha256\n"); + cifs_dbg(VFS, "could not allocate crypto hmacsha256\n"); rc = PTR_ERR(server->secmech.hmacsha256); goto crypto_allocate_hmacsha256_fail; } @@ -753,7 +752,6 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server) crypto_shash_descsize(server->secmech.hmacmd5); server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); if (!server->secmech.sdeschmacmd5) { - cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5"); rc = -ENOMEM; goto crypto_allocate_hmacmd5_sdesc_fail; } @@ -764,7 +762,6 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server) 
crypto_shash_descsize(server->secmech.md5); server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); if (!server->secmech.sdescmd5) { - cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5"); rc = -ENOMEM; goto crypto_allocate_md5_sdesc_fail; } @@ -775,7 +772,6 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server) crypto_shash_descsize(server->secmech.hmacsha256); server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL); if (!server->secmech.sdeschmacsha256) { - cERROR(1, "%s: Can't alloc hmacsha256\n", __func__); rc = -ENOMEM; goto crypto_allocate_hmacsha256_sdesc_fail; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 1a052c0eee8e..72e4efee1389 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -91,6 +91,30 @@ struct workqueue_struct *cifsiod_wq; __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; #endif +/* + * Bumps refcount for cifs super block. + * Note that it should be only called if a referece to VFS super block is + * already held, e.g. in open-type syscalls context. Otherwise it can race with + * atomic_dec_and_test in deactivate_locked_super. 
+ */ +void +cifs_sb_active(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); +} + +void +cifs_sb_deactive(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); +} + static int cifs_read_super(struct super_block *sb) { @@ -137,7 +161,7 @@ cifs_read_super(struct super_block *sb) #ifdef CONFIG_CIFS_NFSD_EXPORT if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - cFYI(1, "export ops supported"); + cifs_dbg(FYI, "export ops supported\n"); sb->s_export_op = &cifs_export_ops; } #endif /* CONFIG_CIFS_NFSD_EXPORT */ @@ -145,7 +169,7 @@ cifs_read_super(struct super_block *sb) return 0; out_no_root: - cERROR(1, "cifs_read_super: get root inode failed"); + cifs_dbg(VFS, "%s: get root inode failed\n", __func__); return rc; } @@ -478,7 +502,7 @@ static void cifs_umount_begin(struct super_block *sb) /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ /* cancel_notify_requests(tcon); */ if (tcon->ses && tcon->ses->server) { - cFYI(1, "wake up tasks now - umount begin not complete"); + cifs_dbg(FYI, "wake up tasks now - umount begin not complete\n"); wake_up_all(&tcon->ses->server->request_q); wake_up_all(&tcon->ses->server->response_q); msleep(1); /* yield */ @@ -549,7 +573,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) if (full_path == NULL) return ERR_PTR(-ENOMEM); - cFYI(1, "Get root dentry for %s", full_path); + cifs_dbg(FYI, "Get root dentry for %s\n", full_path); sep = CIFS_DIR_SEP(cifs_sb); dentry = dget(sb->s_root); @@ -608,7 +632,7 @@ cifs_do_mount(struct file_system_type *fs_type, struct cifs_mnt_data mnt_data; struct dentry *root; - cFYI(1, "Devname: %s flags: %d ", dev_name, flags); + cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags); volume_info = cifs_get_volume_info((char *)data, dev_name); if (IS_ERR(volume_info)) @@ -631,7 +655,8 
@@ cifs_do_mount(struct file_system_type *fs_type, rc = cifs_mount(cifs_sb, volume_info); if (rc) { if (!(flags & MS_SILENT)) - cERROR(1, "cifs_mount failed w/return code = %d", rc); + cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n", + rc); root = ERR_PTR(rc); goto out_mountdata; } @@ -651,7 +676,7 @@ cifs_do_mount(struct file_system_type *fs_type, } if (sb->s_root) { - cFYI(1, "Use existing superblock"); + cifs_dbg(FYI, "Use existing superblock\n"); cifs_umount(cifs_sb); } else { rc = cifs_read_super(sb); @@ -667,7 +692,7 @@ cifs_do_mount(struct file_system_type *fs_type, if (IS_ERR(root)) goto out_super; - cFYI(1, "dentry root is: %p", root); + cifs_dbg(FYI, "dentry root is: %p\n", root); goto out; out_super: @@ -699,7 +724,8 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, rc = filemap_fdatawrite(inode->i_mapping); if (rc) - cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode); + cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n", + rc, inode); return written; } @@ -777,6 +803,7 @@ struct file_system_type cifs_fs_type = { .kill_sb = cifs_kill_sb, /* .fs_flags */ }; +MODULE_ALIAS_FS("cifs"); const struct inode_operations cifs_dir_inode_ops = { .create = cifs_create, .atomic_open = cifs_atomic_open, @@ -1005,7 +1032,10 @@ cifs_init_request_bufs(void) } else { CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/ } -/* cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */ +/* + cifs_dbg(VFS, "CIFSMaxBufSize %d 0x%x\n", + CIFSMaxBufSize, CIFSMaxBufSize); +*/ cifs_req_cachep = kmem_cache_create("cifs_request", CIFSMaxBufSize + max_hdr_size, 0, SLAB_HWCACHE_ALIGN, NULL); @@ -1016,7 +1046,7 @@ cifs_init_request_bufs(void) cifs_min_rcv = 1; else if (cifs_min_rcv > 64) { cifs_min_rcv = 64; - cERROR(1, "cifs_min_rcv set to maximum (64)"); + cifs_dbg(VFS, "cifs_min_rcv set to maximum (64)\n"); } cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv, @@ -1047,7 +1077,7 @@ 
cifs_init_request_bufs(void) cifs_min_small = 2; else if (cifs_min_small > 256) { cifs_min_small = 256; - cFYI(1, "cifs_min_small set to maximum (256)"); + cifs_dbg(FYI, "cifs_min_small set to maximum (256)\n"); } cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small, @@ -1138,10 +1168,11 @@ init_cifs(void) if (cifs_max_pending < 2) { cifs_max_pending = 2; - cFYI(1, "cifs_max_pending set to min of 2"); + cifs_dbg(FYI, "cifs_max_pending set to min of 2\n"); } else if (cifs_max_pending > CIFS_MAX_REQ) { cifs_max_pending = CIFS_MAX_REQ; - cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ); + cifs_dbg(FYI, "cifs_max_pending set to max of %u\n", + CIFS_MAX_REQ); } cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); @@ -1210,7 +1241,7 @@ out_clean_proc: static void __exit exit_cifs(void) { - cFYI(DBG2, "exit_cifs"); + cifs_dbg(NOISY, "exit_cifs\n"); unregister_filesystem(&cifs_fs_type); cifs_dfs_release_automount_timer(); #ifdef CONFIG_CIFS_ACL diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7163419cecd9..0e32c3446ce9 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,6 +41,10 @@ extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; +/* Functions related to super block operations */ +extern void cifs_sb_active(struct super_block *sb); +extern void cifs_sb_deactive(struct super_block *sb); + /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f450f0683ddd..dda188a94332 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -45,17 +45,17 @@ extern void _free_xid(unsigned int); #define get_xid() \ ({ \ unsigned int __xid = _get_xid(); \ - cFYI(1, "CIFS VFS: in %s as Xid: %u with uid: %d", \ - __func__, __xid, \ - from_kuid(&init_user_ns, 
current_fsuid())); \ + cifs_dbg(FYI, "CIFS VFS: in %s as Xid: %u with uid: %d\n", \ + __func__, __xid, \ + from_kuid(&init_user_ns, current_fsuid())); \ __xid; \ }) #define free_xid(curr_xid) \ do { \ _free_xid(curr_xid); \ - cFYI(1, "CIFS VFS: leaving %s (xid = %u) rc = %d", \ - __func__, curr_xid, (int)rc); \ + cifs_dbg(FYI, "CIFS VFS: leaving %s (xid = %u) rc = %d\n", \ + __func__, curr_xid, (int)rc); \ } while (0) extern int init_cifs_idmap(void); extern void exit_cifs_idmap(void); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 7353bc5d73d7..a58dc77cc443 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -139,8 +139,8 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) if (smb_command != SMB_COM_WRITE_ANDX && smb_command != SMB_COM_OPEN_ANDX && smb_command != SMB_COM_TREE_DISCONNECT) { - cFYI(1, "can not send cmd %d while umounting", - smb_command); + cifs_dbg(FYI, "can not send cmd %d while umounting\n", + smb_command); return -ENODEV; } } @@ -163,7 +163,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) * back on-line */ if (!tcon->retry) { - cFYI(1, "gave up waiting on reconnect in smb_init"); + cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); return -EHOSTDOWN; } } @@ -191,7 +191,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) cifs_mark_open_files_invalid(tcon); rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); mutex_unlock(&ses->session_mutex); - cFYI(1, "reconnect tcon rc = %d", rc); + cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) goto out; @@ -396,7 +396,7 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) else /* if override flags set only sign/seal OR them with global auth */ secFlags = global_secflags | ses->overrideSecFlg; - cFYI(1, "secFlags 0x%x", secFlags); + cifs_dbg(FYI, "secFlags 0x%x\n", secFlags); pSMB->hdr.Mid = get_next_mid(server); pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); @@ -404,12 +404,12 @@ CIFSSMBNegotiate(const 
unsigned int xid, struct cifs_ses *ses) if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { - cFYI(1, "Kerberos only mechanism, enable extended security"); + cifs_dbg(FYI, "Kerberos only mechanism, enable extended security\n"); pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { - cFYI(1, "NTLMSSP only mechanism, enable extended security"); + cifs_dbg(FYI, "NTLMSSP only mechanism, enable extended security\n"); pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; } @@ -428,7 +428,7 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) goto neg_err_exit; server->dialect = le16_to_cpu(pSMBr->DialectIndex); - cFYI(1, "Dialect: %d", server->dialect); + cifs_dbg(FYI, "Dialect: %d\n", server->dialect); /* Check wct = 1 error case */ if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) { /* core returns wct = 1, but we do not ask for core - otherwise @@ -447,8 +447,7 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) (secFlags & CIFSSEC_MAY_PLNTXT)) server->secType = LANMAN; else { - cERROR(1, "mount failed weak security disabled" - " in /proc/fs/cifs/SecurityFlags"); + cifs_dbg(VFS, "mount failed weak security disabled in /proc/fs/cifs/SecurityFlags\n"); rc = -EOPNOTSUPP; goto neg_err_exit; } @@ -482,9 +481,9 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) utc = CURRENT_TIME; ts = cnvrtDosUnixTm(rsp->SrvTime.Date, rsp->SrvTime.Time, 0); - cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d", - (int)ts.tv_sec, (int)utc.tv_sec, - (int)(utc.tv_sec - ts.tv_sec)); + cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n", + (int)ts.tv_sec, (int)utc.tv_sec, + (int)(utc.tv_sec - ts.tv_sec)); val = (int)(utc.tv_sec - ts.tv_sec); seconds = abs(val); result = (seconds / 
MIN_TZ_ADJ) * MIN_TZ_ADJ; @@ -498,7 +497,7 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) server->timeAdj = (int)tmp; server->timeAdj *= 60; /* also in seconds */ } - cFYI(1, "server->timeAdj: %d seconds", server->timeAdj); + cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj); /* BB get server time for time conversions and add @@ -513,14 +512,13 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) goto neg_err_exit; } - cFYI(1, "LANMAN negotiated"); + cifs_dbg(FYI, "LANMAN negotiated\n"); /* we will not end up setting signing flags - as no signing was in LANMAN and server did not return the flags on */ goto signing_check; #else /* weak security disabled */ } else if (pSMBr->hdr.WordCount == 13) { - cERROR(1, "mount failed, cifs module not built " - "with CIFS_WEAK_PW_HASH support"); + cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n"); rc = -EOPNOTSUPP; #endif /* WEAK_PW_HASH */ goto neg_err_exit; @@ -532,14 +530,13 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) /* else wct == 17 NTLM */ server->sec_mode = pSMBr->SecurityMode; if ((server->sec_mode & SECMODE_USER) == 0) - cFYI(1, "share mode security"); + cifs_dbg(FYI, "share mode security\n"); if ((server->sec_mode & SECMODE_PW_ENCRYPT) == 0) #ifdef CONFIG_CIFS_WEAK_PW_HASH if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0) #endif /* CIFS_WEAK_PW_HASH */ - cERROR(1, "Server requests plain text password" - " but client support disabled"); + cifs_dbg(VFS, "Server requests plain text password but client support disabled\n"); if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) server->secType = NTLMv2; @@ -555,7 +552,7 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) server->secType = LANMAN; else { rc = -EOPNOTSUPP; - cERROR(1, "Invalid security type"); + cifs_dbg(VFS, "Invalid security type\n"); goto neg_err_exit; } /* else ... any others ...? 
*/ @@ -568,7 +565,7 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) /* probably no need to store and check maxvcs */ server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); - cFYI(DBG2, "Max buf = %d", ses->server->maxBuf); + cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf); server->capabilities = le32_to_cpu(pSMBr->Capabilities); server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); server->timeAdj *= 60; @@ -590,7 +587,7 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) if (memcmp(server->server_GUID, pSMBr->u.extended_response. GUID, 16) != 0) { - cFYI(1, "server UID changed"); + cifs_dbg(FYI, "server UID changed\n"); memcpy(server->server_GUID, pSMBr->u.extended_response.GUID, 16); @@ -633,21 +630,19 @@ signing_check: if ((secFlags & CIFSSEC_MAY_SIGN) == 0) { /* MUST_SIGN already includes the MAY_SIGN FLAG so if this is zero it means that signing is disabled */ - cFYI(1, "Signing disabled"); + cifs_dbg(FYI, "Signing disabled\n"); if (server->sec_mode & SECMODE_SIGN_REQUIRED) { - cERROR(1, "Server requires " - "packet signing to be enabled in " - "/proc/fs/cifs/SecurityFlags."); + cifs_dbg(VFS, "Server requires packet signing to be enabled in /proc/fs/cifs/SecurityFlags\n"); rc = -EOPNOTSUPP; } server->sec_mode &= ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { /* signing required */ - cFYI(1, "Must sign - secFlags 0x%x", secFlags); + cifs_dbg(FYI, "Must sign - secFlags 0x%x\n", secFlags); if ((server->sec_mode & (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) { - cERROR(1, "signing required but server lacks support"); + cifs_dbg(VFS, "signing required but server lacks support\n"); rc = -EOPNOTSUPP; } else server->sec_mode |= SECMODE_SIGN_REQUIRED; @@ -661,7 +656,7 @@ signing_check: neg_err_exit: cifs_buf_release(pSMB); - cFYI(1, "negprot rc %d", rc); + cifs_dbg(FYI, "negprot rc %d\n", rc); 
return rc; } @@ -671,7 +666,7 @@ CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon) struct smb_hdr *smb_buffer; int rc = 0; - cFYI(1, "In tree disconnect"); + cifs_dbg(FYI, "In tree disconnect\n"); /* BB: do we need to check this? These should never be NULL. */ if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) @@ -693,7 +688,7 @@ CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon) rc = SendReceiveNoRsp(xid, tcon->ses, (char *)smb_buffer, 0); if (rc) - cFYI(1, "Tree disconnect failed %d", rc); + cifs_dbg(FYI, "Tree disconnect failed %d\n", rc); /* No need to return error on this operation if tid invalidated and closed on server already e.g. due to tcp session crashing */ @@ -728,7 +723,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server) struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 }; - cFYI(1, "In echo request"); + cifs_dbg(FYI, "In echo request\n"); rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb); if (rc) @@ -747,7 +742,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server) rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, server, CIFS_ASYNC_OP | CIFS_ECHO_OP); if (rc) - cFYI(1, "Echo request failed: %d", rc); + cifs_dbg(FYI, "Echo request failed: %d\n", rc); cifs_small_buf_release(smb); @@ -760,7 +755,7 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses) LOGOFF_ANDX_REQ *pSMB; int rc = 0; - cFYI(1, "In SMBLogoff for session disconnect"); + cifs_dbg(FYI, "In SMBLogoff for session disconnect\n"); /* * BB: do we need to check validity of ses and server? 
They should @@ -814,7 +809,7 @@ CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, param_offset, offset, byte_count; - cFYI(1, "In POSIX delete"); + cifs_dbg(FYI, "In POSIX delete\n"); PsxDelete: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -866,7 +861,7 @@ PsxDelete: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "Posix delete returned %d", rc); + cifs_dbg(FYI, "Posix delete returned %d\n", rc); cifs_buf_release(pSMB); cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes); @@ -914,7 +909,7 @@ DelFileRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes); if (rc) - cFYI(1, "Error in RMFile = %d", rc); + cifs_dbg(FYI, "Error in RMFile = %d\n", rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -934,7 +929,7 @@ CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, int name_len; int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; - cFYI(1, "In CIFSSMBRmDir"); + cifs_dbg(FYI, "In CIFSSMBRmDir\n"); RmDirRetry: rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -960,7 +955,7 @@ RmDirRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_rmdirs); if (rc) - cFYI(1, "Error in RMDir = %d", rc); + cifs_dbg(FYI, "Error in RMDir = %d\n", rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -979,7 +974,7 @@ CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, int name_len; int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; - cFYI(1, "In CIFSSMBMkDir"); + cifs_dbg(FYI, "In CIFSSMBMkDir\n"); MkDirRetry: rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -1005,7 +1000,7 @@ MkDirRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); 
cifs_stats_inc(&tcon->stats.cifs_stats.num_mkdirs); if (rc) - cFYI(1, "Error in Mkdir = %d", rc); + cifs_dbg(FYI, "Error in Mkdir = %d\n", rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -1029,7 +1024,7 @@ CIFSPOSIXCreate(const unsigned int xid, struct cifs_tcon *tcon, OPEN_PSX_REQ *pdata; OPEN_PSX_RSP *psx_rsp; - cFYI(1, "In POSIX Create"); + cifs_dbg(FYI, "In POSIX Create\n"); PsxCreat: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -1083,11 +1078,11 @@ PsxCreat: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Posix create returned %d", rc); + cifs_dbg(FYI, "Posix create returned %d\n", rc); goto psx_create_err; } - cFYI(1, "copying inode info"); + cifs_dbg(FYI, "copying inode info\n"); rc = validate_t2((struct smb_t2_rsp *)pSMBr); if (rc || get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)) { @@ -1109,11 +1104,11 @@ PsxCreat: /* check to make sure response data is there */ if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) { pRetData->Type = cpu_to_le32(-1); /* unknown */ - cFYI(DBG2, "unknown type"); + cifs_dbg(NOISY, "unknown type\n"); } else { if (get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP) + sizeof(FILE_UNIX_BASIC_INFO)) { - cERROR(1, "Open response data too small"); + cifs_dbg(VFS, "Open response data too small\n"); pRetData->Type = cpu_to_le32(-1); goto psx_create_err; } @@ -1160,7 +1155,7 @@ static __u16 convert_disposition(int disposition) ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC; break; default: - cFYI(1, "unknown disposition %d", disposition); + cifs_dbg(FYI, "unknown disposition %d\n", disposition); ofun = SMBOPEN_OAPPEND; /* regular open */ } return ofun; @@ -1251,7 +1246,7 @@ OldOpenRetry: (struct smb_hdr *)pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_opens); if (rc) { - cFYI(1, "Error in Open = %d", rc); + cifs_dbg(FYI, "Error in Open = %d\n", rc); } else { /* BB verify if wct == 15 */ @@ 
-1364,7 +1359,7 @@ openRetry: (struct smb_hdr *)pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_opens); if (rc) { - cFYI(1, "Error in Open = %d", rc); + cifs_dbg(FYI, "Error in Open = %d\n", rc); } else { *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */ *netfid = pSMBr->Fid; /* cifs fid stays in le */ @@ -1425,8 +1420,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) char *buf = server->smallbuf; unsigned int buflen = get_rfc1002_length(buf) + 4; - cFYI(1, "%s: mid=%llu offset=%llu bytes=%u", __func__, - mid->mid, rdata->offset, rdata->bytes); + cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n", + __func__, mid->mid, rdata->offset, rdata->bytes); /* * read the rest of READ_RSP header (sans Data array), or whatever we @@ -1447,16 +1442,16 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) /* Was the SMB read successful? */ rdata->result = server->ops->map_error(buf, false); if (rdata->result != 0) { - cFYI(1, "%s: server returned error %d", __func__, - rdata->result); + cifs_dbg(FYI, "%s: server returned error %d\n", + __func__, rdata->result); return cifs_readv_discard(server, mid); } /* Is there enough to get to the rest of the READ_RSP header? */ if (server->total_read < server->vals->read_rsp_size) { - cFYI(1, "%s: server returned short header. got=%u expected=%zu", - __func__, server->total_read, - server->vals->read_rsp_size); + cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n", + __func__, server->total_read, + server->vals->read_rsp_size); rdata->result = -EIO; return cifs_readv_discard(server, mid); } @@ -1468,19 +1463,19 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) * is beyond the EOF. Treat it as if the data starts just after * the header. 
*/ - cFYI(1, "%s: data offset (%u) inside read response header", - __func__, data_offset); + cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n", + __func__, data_offset); data_offset = server->total_read; } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) { /* data_offset is beyond the end of smallbuf */ - cFYI(1, "%s: data offset (%u) beyond end of smallbuf", - __func__, data_offset); + cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n", + __func__, data_offset); rdata->result = -EIO; return cifs_readv_discard(server, mid); } - cFYI(1, "%s: total_read=%u data_offset=%u", __func__, - server->total_read, data_offset); + cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n", + __func__, server->total_read, data_offset); len = data_offset - server->total_read; if (len > 0) { @@ -1496,8 +1491,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) /* set up first iov for signature check */ rdata->iov.iov_base = buf; rdata->iov.iov_len = server->total_read; - cFYI(1, "0: iov_base=%p iov_len=%zu", - rdata->iov.iov_base, rdata->iov.iov_len); + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov.iov_base, rdata->iov.iov_len); /* how much data is in the response? 
*/ data_len = server->ops->read_data_length(buf); @@ -1514,8 +1509,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) server->total_read += length; rdata->bytes = length; - cFYI(1, "total_read=%u buflen=%u remaining=%u", server->total_read, - buflen, data_len); + cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", + server->total_read, buflen, data_len); /* discard anything left over */ if (server->total_read < buflen) @@ -1538,8 +1533,9 @@ cifs_readv_callback(struct mid_q_entry *mid) .rq_pagesz = rdata->pagesz, .rq_tailsz = rdata->tailsz }; - cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__, - mid->mid, mid->mid_state, rdata->result, rdata->bytes); + cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n", + __func__, mid->mid, mid->mid_state, rdata->result, + rdata->bytes); switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -1549,10 +1545,10 @@ cifs_readv_callback(struct mid_q_entry *mid) int rc = 0; rc = cifs_verify_signature(&rqst, server, - mid->sequence_number + 1); + mid->sequence_number); if (rc) - cERROR(1, "SMB signature verification returned " - "error = %d", rc); + cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + rc); } /* FIXME: should this be counted toward the initiating task? 
*/ task_io_account_read(rdata->bytes); @@ -1582,8 +1578,8 @@ cifs_async_readv(struct cifs_readdata *rdata) struct smb_rqst rqst = { .rq_iov = &rdata->iov, .rq_nvec = 1 }; - cFYI(1, "%s: offset=%llu bytes=%u", __func__, - rdata->offset, rdata->bytes); + cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", + __func__, rdata->offset, rdata->bytes); if (tcon->ses->capabilities & CAP_LARGE_FILES) wct = 12; @@ -1653,7 +1649,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, struct cifs_tcon *tcon = io_parms->tcon; unsigned int count = io_parms->length; - cFYI(1, "Reading %d bytes on fid %d", count, netfid); + cifs_dbg(FYI, "Reading %d bytes on fid %d\n", count, netfid); if (tcon->ses->capabilities & CAP_LARGE_FILES) wct = 12; else { @@ -1701,7 +1697,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_stats_inc(&tcon->stats.cifs_stats.num_reads); pSMBr = (READ_RSP *)iov[0].iov_base; if (rc) { - cERROR(1, "Send error in read = %d", rc); + cifs_dbg(VFS, "Send error in read = %d\n", rc); } else { int data_length = le16_to_cpu(pSMBr->DataLengthHigh); data_length = data_length << 16; @@ -1711,7 +1707,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, /*check that DataLength would not go beyond end of SMB */ if ((data_length > CIFSMaxBufSize) || (data_length > count)) { - cFYI(1, "bad length %d for count %d", + cifs_dbg(FYI, "bad length %d for count %d\n", data_length, count); rc = -EIO; *nbytes = 0; @@ -1719,7 +1715,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, pReadData = (char *) (&pSMBr->hdr.Protocol) + le16_to_cpu(pSMBr->DataOffset); /* if (rc = copy_to_user(buf, pReadData, data_length)) { - cERROR(1, "Faulting on read rc = %d",rc); + cifs_dbg(VFS, "Faulting on read rc = %d\n",rc); rc = -EFAULT; }*/ /* can not use copy_to_user when using page cache*/ if (*buf) @@ -1767,7 +1763,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, *nbytes = 0; - /* cFYI(1, "write at 
%lld %d bytes", offset, count);*/ + /* cifs_dbg(FYI, "write at %lld %d bytes\n", offset, count);*/ if (tcon->ses == NULL) return -ECONNABORTED; @@ -1852,7 +1848,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, (struct smb_hdr *) pSMBr, &bytes_returned, long_op); cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); if (rc) { - cFYI(1, "Send error in write = %d", rc); + cifs_dbg(FYI, "Send error in write = %d\n", rc); } else { *nbytes = le16_to_cpu(pSMBr->CountHigh); *nbytes = (*nbytes) << 16; @@ -1909,12 +1905,12 @@ cifs_writev_requeue(struct cifs_writedata *wdata) } while (rc == -EAGAIN); for (i = 0; i < wdata->nr_pages; i++) { + unlock_page(wdata->pages[i]); if (rc != 0) { SetPageError(wdata->pages[i]); end_page_writeback(wdata->pages[i]); page_cache_release(wdata->pages[i]); } - unlock_page(wdata->pages[i]); } mapping_set_error(inode->i_mapping, rc); @@ -1959,7 +1955,7 @@ cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) /* this would overflow */ if (nr_pages == 0) { - cERROR(1, "%s: called with nr_pages == 0!", __func__); + cifs_dbg(VFS, "%s: called with nr_pages == 0!\n", __func__); return NULL; } @@ -2075,7 +2071,8 @@ cifs_async_writev(struct cifs_writedata *wdata) rqst.rq_pagesz = wdata->pagesz; rqst.rq_tailsz = wdata->tailsz; - cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes); + cifs_dbg(FYI, "async write at %llu %u bytes\n", + wdata->offset, wdata->bytes); smb->DataLengthLow = cpu_to_le16(wdata->bytes & 0xFFFF); smb->DataLengthHigh = cpu_to_le16(wdata->bytes >> 16); @@ -2123,7 +2120,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, *nbytes = 0; - cFYI(1, "write2 at %lld %d bytes", (long long)offset, count); + cifs_dbg(FYI, "write2 at %lld %d bytes\n", (long long)offset, count); if (tcon->ses->capabilities & CAP_LARGE_FILES) { wct = 14; @@ -2182,7 +2179,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, rc = SendReceive2(xid, tcon->ses, iov, n_vec + 
1, &resp_buf_type, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); if (rc) { - cFYI(1, "Send error Write2 = %d", rc); + cifs_dbg(FYI, "Send error Write2 = %d\n", rc); } else if (resp_buf_type == 0) { /* presumably this can not happen, but best to be safe */ rc = -EIO; @@ -2223,7 +2220,8 @@ int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon, int resp_buf_type; __u16 count; - cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock); + cifs_dbg(FYI, "cifs_lockv num lock %d num unlock %d\n", + num_lock, num_unlock); rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); if (rc) @@ -2249,7 +2247,7 @@ int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon, cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); if (rc) - cFYI(1, "Send error in cifs_lockv = %d", rc); + cifs_dbg(FYI, "Send error in cifs_lockv = %d\n", rc); return rc; } @@ -2268,7 +2266,8 @@ CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon, int flags = 0; __u16 count; - cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock); + cifs_dbg(FYI, "CIFSSMBLock timeout %d numLock %d\n", + (int)waitFlag, numLock); rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB); if (rc) @@ -2317,7 +2316,7 @@ CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon, } cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); if (rc) - cFYI(1, "Send error in Lock = %d", rc); + cifs_dbg(FYI, "Send error in Lock = %d\n", rc); /* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@ -2341,7 +2340,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, __u16 params, param_offset, offset, byte_count, count; struct kvec iov[1]; - cFYI(1, "Posix Lock"); + cifs_dbg(FYI, "Posix Lock\n"); rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); @@ -2408,7 +2407,7 @@ CIFSSMBPosixLock(const unsigned 
int xid, struct cifs_tcon *tcon, } if (rc) { - cFYI(1, "Send error in Posix Lock = %d", rc); + cifs_dbg(FYI, "Send error in Posix Lock = %d\n", rc); } else if (pLockData) { /* lock structure can be returned on get */ __u16 data_offset; @@ -2465,7 +2464,7 @@ CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) { int rc = 0; CLOSE_REQ *pSMB = NULL; - cFYI(1, "In CIFSSMBClose"); + cifs_dbg(FYI, "In CIFSSMBClose\n"); /* do not retry on dead session on close */ rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB); @@ -2482,7 +2481,7 @@ CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) if (rc) { if (rc != -EINTR) { /* EINTR is expected when user ctl-c to kill app */ - cERROR(1, "Send error in Close = %d", rc); + cifs_dbg(VFS, "Send error in Close = %d\n", rc); } } @@ -2498,7 +2497,7 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) { int rc = 0; FLUSH_REQ *pSMB = NULL; - cFYI(1, "In CIFSSMBFlush"); + cifs_dbg(FYI, "In CIFSSMBFlush\n"); rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB); if (rc) @@ -2509,7 +2508,7 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id) rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_flushes); if (rc) - cERROR(1, "Send error in Flush = %d", rc); + cifs_dbg(VFS, "Send error in Flush = %d\n", rc); return rc; } @@ -2527,7 +2526,7 @@ CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, __u16 count; int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; - cFYI(1, "In CIFSSMBRename"); + cifs_dbg(FYI, "In CIFSSMBRename\n"); renameRetry: rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -2574,7 +2573,7 @@ renameRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_renames); if (rc) - cFYI(1, "Send error in rename = %d", rc); + cifs_dbg(FYI, "Send error in rename = %d\n", rc); 
cifs_buf_release(pSMB); @@ -2598,7 +2597,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon, int len_of_str; __u16 params, param_offset, offset, count, byte_count; - cFYI(1, "Rename to File by handle"); + cifs_dbg(FYI, "Rename to File by handle\n"); rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -2655,7 +2654,8 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon, (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&pTcon->stats.cifs_stats.num_t2renames); if (rc) - cFYI(1, "Send error in Rename (by file handle) = %d", rc); + cifs_dbg(FYI, "Send error in Rename (by file handle) = %d\n", + rc); cifs_buf_release(pSMB); @@ -2677,7 +2677,7 @@ CIFSSMBCopy(const unsigned int xid, struct cifs_tcon *tcon, int name_len, name_len2; __u16 count; - cFYI(1, "In CIFSSMBCopy"); + cifs_dbg(FYI, "In CIFSSMBCopy\n"); copyRetry: rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -2722,8 +2722,8 @@ copyRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in copy = %d with %d files copied", - rc, le16_to_cpu(pSMBr->CopyCount)); + cifs_dbg(FYI, "Send error in copy = %d with %d files copied\n", + rc, le16_to_cpu(pSMBr->CopyCount)); } cifs_buf_release(pSMB); @@ -2747,7 +2747,7 @@ CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, param_offset, offset, byte_count; - cFYI(1, "In Symlink Unix style"); + cifs_dbg(FYI, "In Symlink Unix style\n"); createSymLinkRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -2812,7 +2812,8 @@ createSymLinkRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_symlinks); if (rc) - cFYI(1, "Send error in SetPathInfo create symlink = %d", rc); + cifs_dbg(FYI, "Send error in SetPathInfo create symlink = 
%d\n", + rc); cifs_buf_release(pSMB); @@ -2836,7 +2837,7 @@ CIFSUnixCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, param_offset, offset, byte_count; - cFYI(1, "In Create Hard link Unix style"); + cifs_dbg(FYI, "In Create Hard link Unix style\n"); createHardLinkRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -2898,7 +2899,8 @@ createHardLinkRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks); if (rc) - cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc); + cifs_dbg(FYI, "Send error in SetPathInfo (hard link) = %d\n", + rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -2920,7 +2922,7 @@ CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, __u16 count; int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; - cFYI(1, "In CIFSCreateHardLink"); + cifs_dbg(FYI, "In CIFSCreateHardLink\n"); winCreateHardLinkRetry: rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB, @@ -2972,7 +2974,7 @@ winCreateHardLinkRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks); if (rc) - cFYI(1, "Send error in hard link (NT rename) = %d", rc); + cifs_dbg(FYI, "Send error in hard link (NT rename) = %d\n", rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -2995,7 +2997,7 @@ CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, __u16 params, byte_count; char *data_start; - cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName); + cifs_dbg(FYI, "In QPathSymLinkInfo (Unix) for path %s\n", searchName); querySymLinkRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, @@ -3042,7 +3044,7 @@ querySymLinkRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QuerySymLinkInfo = %d", rc); + cifs_dbg(FYI, "Send error in 
QuerySymLinkInfo = %d\n", rc); } else { /* decode response */ @@ -3097,7 +3099,8 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon, struct smb_com_transaction_ioctl_req *pSMB; struct smb_com_transaction_ioctl_rsp *pSMBr; - cFYI(1, "In Windows reparse style QueryLink for path %s", searchName); + cifs_dbg(FYI, "In Windows reparse style QueryLink for path %s\n", + searchName); rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -3125,7 +3128,7 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon, rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc); + cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc); } else { /* decode response */ __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); __u32 data_count = le32_to_cpu(pSMBr->DataCount); @@ -3149,7 +3152,7 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon, if ((reparse_buf->LinkNamesBuf + reparse_buf->TargetNameOffset + reparse_buf->TargetNameLen) > end_of_smb) { - cFYI(1, "reparse buf beyond SMB"); + cifs_dbg(FYI, "reparse buf beyond SMB\n"); rc = -EIO; goto qreparse_out; } @@ -3170,12 +3173,11 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon, } } else { rc = -EIO; - cFYI(1, "Invalid return data count on " - "get reparse info ioctl"); + cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n"); } symlinkinfo[buflen] = 0; /* just in case so the caller does not go off the end of the buffer */ - cFYI(1, "readlink result - %s", symlinkinfo); + cifs_dbg(FYI, "readlink result - %s\n", symlinkinfo); } qreparse_out: @@ -3198,7 +3200,10 @@ static void cifs_convert_ace(posix_acl_xattr_entry *ace, ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm); ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag); ace->e_id = 
cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid)); - /* cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */ +/* + cifs_dbg(FYI, "perm %d tag %d id %d\n", + ace->e_perm, ace->e_tag, ace->e_id); +*/ return; } @@ -3224,8 +3229,8 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, size += sizeof(struct cifs_posix_ace) * count; /* check if we would go beyond end of SMB */ if (size_of_data_area < size) { - cFYI(1, "bad CIFS POSIX ACL size %d vs. %d", - size_of_data_area, size); + cifs_dbg(FYI, "bad CIFS POSIX ACL size %d vs. %d\n", + size_of_data_area, size); return -EINVAL; } } else if (acl_type & ACL_TYPE_DEFAULT) { @@ -3272,7 +3277,10 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace, cifs_ace->cifs_uid = cpu_to_le64(-1); } else cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id)); - /*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/ +/* + cifs_dbg(FYI, "perm %d tag %d id %d\n", + ace->e_perm, ace->e_tag, ace->e_id); +*/ return rc; } @@ -3290,12 +3298,11 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, return 0; count = posix_acl_xattr_count((size_t)buflen); - cFYI(1, "setting acl with %d entries from buf of length %d and " - "version of %d", - count, buflen, le32_to_cpu(local_acl->a_version)); + cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n", + count, buflen, le32_to_cpu(local_acl->a_version)); if (le32_to_cpu(local_acl->a_version) != 2) { - cFYI(1, "unknown POSIX ACL version %d", - le32_to_cpu(local_acl->a_version)); + cifs_dbg(FYI, "unknown POSIX ACL version %d\n", + le32_to_cpu(local_acl->a_version)); return 0; } cifs_acl->version = cpu_to_le16(1); @@ -3304,7 +3311,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, else if (acl_type == ACL_TYPE_DEFAULT) cifs_acl->default_entry_count = cpu_to_le16(count); else { - cFYI(1, "unknown ACL type %d", acl_type); + cifs_dbg(FYI, "unknown ACL type %d\n", 
acl_type); return 0; } for (i = 0; i < count; i++) { @@ -3337,7 +3344,7 @@ CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, int name_len; __u16 params, byte_count; - cFYI(1, "In GetPosixACL (Unix) for path %s", searchName); + cifs_dbg(FYI, "In GetPosixACL (Unix) for path %s\n", searchName); queryAclRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, @@ -3390,7 +3397,7 @@ queryAclRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get); if (rc) { - cFYI(1, "Send error in Query POSIX ACL = %d", rc); + cifs_dbg(FYI, "Send error in Query POSIX ACL = %d\n", rc); } else { /* decode response */ @@ -3427,7 +3434,7 @@ CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, byte_count, data_count, param_offset, offset; - cFYI(1, "In SetPosixACL (Unix) for path %s", fileName); + cifs_dbg(FYI, "In SetPosixACL (Unix) for path %s\n", fileName); setAclRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -3482,7 +3489,7 @@ setAclRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "Set POSIX ACL returned %d", rc); + cifs_dbg(FYI, "Set POSIX ACL returned %d\n", rc); setACLerrorExit: cifs_buf_release(pSMB); @@ -3502,7 +3509,7 @@ CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned; __u16 params, byte_count; - cFYI(1, "In GetExtAttr"); + cifs_dbg(FYI, "In GetExtAttr\n"); if (tcon == NULL) return -ENODEV; @@ -3541,7 +3548,7 @@ GetExtAttrRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "error %d in GetExtAttr", rc); + cifs_dbg(FYI, "error %d in GetExtAttr\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -3556,7 +3563,7 @@ GetExtAttrRetry: struct file_chattr_info *pfinfo; /* BB Do we need a 
cast or hash here ? */ if (count != 16) { - cFYI(1, "Illegal size ret in GetExtAttr"); + cifs_dbg(FYI, "Illegal size ret in GetExtAttr\n"); rc = -EIO; goto GetExtAttrOut; } @@ -3644,21 +3651,21 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata, /* should we also check that parm and data areas do not overlap? */ if (*ppparm > end_of_smb) { - cFYI(1, "parms start after end of smb"); + cifs_dbg(FYI, "parms start after end of smb\n"); return -EINVAL; } else if (parm_count + *ppparm > end_of_smb) { - cFYI(1, "parm end after end of smb"); + cifs_dbg(FYI, "parm end after end of smb\n"); return -EINVAL; } else if (*ppdata > end_of_smb) { - cFYI(1, "data starts after end of smb"); + cifs_dbg(FYI, "data starts after end of smb\n"); return -EINVAL; } else if (data_count + *ppdata > end_of_smb) { - cFYI(1, "data %p + count %d (%p) past smb end %p start %p", - *ppdata, data_count, (data_count + *ppdata), - end_of_smb, pSMBr); + cifs_dbg(FYI, "data %p + count %d (%p) past smb end %p start %p\n", + *ppdata, data_count, (data_count + *ppdata), + end_of_smb, pSMBr); return -EINVAL; } else if (parm_count + data_count > bcc) { - cFYI(1, "parm count and data count larger than SMB"); + cifs_dbg(FYI, "parm count and data count larger than SMB\n"); return -EINVAL; } *pdatalen = data_count; @@ -3676,7 +3683,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, QUERY_SEC_DESC_REQ *pSMB; struct kvec iov[1]; - cFYI(1, "GetCifsACL"); + cifs_dbg(FYI, "GetCifsACL\n"); *pbuflen = 0; *acl_inf = NULL; @@ -3701,7 +3708,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get); if (rc) { - cFYI(1, "Send error in QuerySecDesc = %d", rc); + cifs_dbg(FYI, "Send error in QuerySecDesc = %d\n", rc); } else { /* decode response */ __le32 *parm; __u32 parm_len; @@ -3716,7 +3723,8 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, goto qsec_out; pSMBr = (struct 
smb_com_ntransact_rsp *)iov[0].iov_base; - cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf); + cifs_dbg(FYI, "smb %p parm %p data %p\n", + pSMBr, parm, *acl_inf); if (le32_to_cpu(pSMBr->ParameterCount) != 4) { rc = -EIO; /* bad smb */ @@ -3728,8 +3736,8 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, acl_len = le32_to_cpu(*parm); if (acl_len != *pbuflen) { - cERROR(1, "acl length %d does not match %d", - acl_len, *pbuflen); + cifs_dbg(VFS, "acl length %d does not match %d\n", + acl_len, *pbuflen); if (*pbuflen > acl_len) *pbuflen = acl_len; } @@ -3738,16 +3746,15 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, header followed by the smallest SID */ if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) || (*pbuflen >= 64 * 1024)) { - cERROR(1, "bad acl length %d", *pbuflen); + cifs_dbg(VFS, "bad acl length %d\n", *pbuflen); rc = -EINVAL; *pbuflen = 0; } else { - *acl_inf = kmalloc(*pbuflen, GFP_KERNEL); + *acl_inf = kmemdup(pdata, *pbuflen, GFP_KERNEL); if (*acl_inf == NULL) { *pbuflen = 0; rc = -ENOMEM; } - memcpy(*acl_inf, pdata, *pbuflen); } } qsec_out: @@ -3809,9 +3816,10 @@ setCifsAclRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); - cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc); + cifs_dbg(FYI, "SetCIFSACL bytes_returned: %d, rc: %d\n", + bytes_returned, rc); if (rc) - cFYI(1, "Set CIFS ACL returned %d", rc); + cifs_dbg(FYI, "Set CIFS ACL returned %d\n", rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -3835,7 +3843,7 @@ SMBQueryInformation(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned; int name_len; - cFYI(1, "In SMBQPath path %s", search_name); + cifs_dbg(FYI, "In SMBQPath path %s\n", search_name); QInfRetry: rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -3862,7 +3870,7 @@ QInfRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, 
(struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QueryInfo = %d", rc); + cifs_dbg(FYI, "Send error in QueryInfo = %d\n", rc); } else if (data) { struct timespec ts; __u32 time = le32_to_cpu(pSMBr->last_write_time); @@ -3936,7 +3944,7 @@ QFileInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QPathInfo = %d", rc); + cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -3973,7 +3981,7 @@ CIFSSMBQPathInfo(const unsigned int xid, struct cifs_tcon *tcon, int name_len; __u16 params, byte_count; - /* cFYI(1, "In QPathInfo path %s", search_name); */ + /* cifs_dbg(FYI, "In QPathInfo path %s\n", search_name); */ QPathInfoRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -4023,7 +4031,7 @@ QPathInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QPathInfo = %d", rc); + cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4104,14 +4112,12 @@ UnixQFileInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QPathInfo = %d", rc); + cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) { - cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response. " - "Unix Extensions can be disabled on mount " - "by specifying the nosfu mount option."); + cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. 
Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n"); rc = -EIO; /* bad smb */ } else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); @@ -4143,7 +4149,7 @@ CIFSSMBUnixQPathInfo(const unsigned int xid, struct cifs_tcon *tcon, int name_len; __u16 params, byte_count; - cFYI(1, "In QPathInfo (Unix) the path %s", searchName); + cifs_dbg(FYI, "In QPathInfo (Unix) the path %s\n", searchName); UnixQPathInfoRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -4190,14 +4196,12 @@ UnixQPathInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QPathInfo = %d", rc); + cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) { - cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response. " - "Unix Extensions can be disabled on mount " - "by specifying the nosfu mount option."); + cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. 
Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n"); rc = -EIO; /* bad smb */ } else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); @@ -4231,7 +4235,7 @@ CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon, __u16 params, byte_count; struct nls_table *nls_codepage; - cFYI(1, "In FindFirst for %s", searchName); + cifs_dbg(FYI, "In FindFirst for %s\n", searchName); findFirstRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, @@ -4314,7 +4318,7 @@ findFirstRetry: if (rc) {/* BB add logic to retry regular search if Unix search rejected unexpectedly by server */ /* BB Add code to handle unsupported level rc */ - cFYI(1, "Error in FindFirst = %d", rc); + cifs_dbg(FYI, "Error in FindFirst = %d\n", rc); cifs_buf_release(pSMB); @@ -4352,7 +4356,7 @@ findFirstRetry: psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); if (CIFSMaxBufSize < lnoff) { - cERROR(1, "ignoring corrupt resume name"); + cifs_dbg(VFS, "ignoring corrupt resume name\n"); psrch_inf->last_entry = NULL; return rc; } @@ -4383,7 +4387,7 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, unsigned int name_len; __u16 params, byte_count; - cFYI(1, "In FindNext"); + cifs_dbg(FYI, "In FindNext\n"); if (psrch_inf->endOfSearch) return -ENOENT; @@ -4444,7 +4448,7 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, cifs_buf_release(pSMB); rc = 0; /* search probably was closed at end of search*/ } else - cFYI(1, "FindNext returned = %d", rc); + cifs_dbg(FYI, "FindNext returned = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4479,15 +4483,15 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, psrch_inf->entries_in_buffer; lnoff = le16_to_cpu(parms->LastNameOffset); if (CIFSMaxBufSize < lnoff) { - cERROR(1, "ignoring corrupt resume name"); + cifs_dbg(VFS, "ignoring corrupt resume name\n"); psrch_inf->last_entry = NULL; return rc; } else 
psrch_inf->last_entry = psrch_inf->srch_entries_start + lnoff; -/* cFYI(1, "fnxt2 entries in buf %d index_of_last %d", - psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */ +/* cifs_dbg(FYI, "fnxt2 entries in buf %d index_of_last %d\n", + psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */ /* BB fixme add unlock here */ } @@ -4512,7 +4516,7 @@ CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; FINDCLOSE_REQ *pSMB = NULL; - cFYI(1, "In CIFSSMBFindClose"); + cifs_dbg(FYI, "In CIFSSMBFindClose\n"); rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB); /* no sense returning error if session restarted @@ -4526,7 +4530,7 @@ CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon, pSMB->ByteCount = 0; rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) - cERROR(1, "Send error in FindClose = %d", rc); + cifs_dbg(VFS, "Send error in FindClose = %d\n", rc); cifs_stats_inc(&tcon->stats.cifs_stats.num_fclose); @@ -4548,7 +4552,7 @@ CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon, int name_len, bytes_returned; __u16 params, byte_count; - cFYI(1, "In GetSrvInodeNum for %s", search_name); + cifs_dbg(FYI, "In GetSrvInodeNum for %s\n", search_name); if (tcon == NULL) return -ENODEV; @@ -4599,7 +4603,7 @@ GetInodeNumberRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "error %d in QueryInternalInfo", rc); + cifs_dbg(FYI, "error %d in QueryInternalInfo\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4614,7 +4618,7 @@ GetInodeNumberRetry: struct file_internal_info *pfinfo; /* BB Do we need a cast or hash here ? 
*/ if (count < 8) { - cFYI(1, "Illegal size ret in QryIntrnlInf"); + cifs_dbg(FYI, "Illegal size ret in QryIntrnlInf\n"); rc = -EIO; goto GetInodeNumOut; } @@ -4655,16 +4659,16 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); if (*num_of_nodes < 1) { - cERROR(1, "num_referrals: must be at least > 0," - "but we get num_referrals = %d", *num_of_nodes); + cifs_dbg(VFS, "num_referrals: must be at least > 0, but we get num_referrals = %d\n", + *num_of_nodes); rc = -EINVAL; goto parse_DFS_referrals_exit; } ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals); if (ref->VersionNumber != cpu_to_le16(3)) { - cERROR(1, "Referrals of V%d version are not supported," - "should be V3", le16_to_cpu(ref->VersionNumber)); + cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n", + le16_to_cpu(ref->VersionNumber)); rc = -EINVAL; goto parse_DFS_referrals_exit; } @@ -4673,14 +4677,12 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, data_end = (char *)(&(pSMBr->PathConsumed)) + le16_to_cpu(pSMBr->t2.DataCount); - cFYI(1, "num_referrals: %d dfs flags: 0x%x ...", - *num_of_nodes, - le32_to_cpu(pSMBr->DFSFlags)); + cifs_dbg(FYI, "num_referrals: %d dfs flags: 0x%x ...\n", + *num_of_nodes, le32_to_cpu(pSMBr->DFSFlags)); - *target_nodes = kzalloc(sizeof(struct dfs_info3_param) * - *num_of_nodes, GFP_KERNEL); + *target_nodes = kcalloc(*num_of_nodes, sizeof(struct dfs_info3_param), + GFP_KERNEL); if (*target_nodes == NULL) { - cERROR(1, "Failed to allocate buffer for target_nodes"); rc = -ENOMEM; goto parse_DFS_referrals_exit; } @@ -4759,7 +4761,7 @@ CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses, *num_of_nodes = 0; *target_nodes = NULL; - cFYI(1, "In GetDFSRefer the path %s", search_name); + cifs_dbg(FYI, "In GetDFSRefer the path %s\n", search_name); if (ses == NULL) return -ENODEV; getDFSRetry: @@ -4827,7 +4829,7 @@ getDFSRetry: rc = SendReceive(xid, ses, (struct smb_hdr *) 
pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in GetDFSRefer = %d", rc); + cifs_dbg(FYI, "Send error in GetDFSRefer = %d\n", rc); goto GetDFSRefExit; } rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4838,9 +4840,8 @@ getDFSRetry: goto GetDFSRefExit; } - cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d", - get_bcc(&pSMBr->hdr), - le16_to_cpu(pSMBr->t2.DataOffset)); + cifs_dbg(FYI, "Decoding GetDFSRefer response BCC: %d Offset %d\n", + get_bcc(&pSMBr->hdr), le16_to_cpu(pSMBr->t2.DataOffset)); /* parse returned result into more usable form */ rc = parse_DFS_referrals(pSMBr, num_of_nodes, @@ -4869,7 +4870,7 @@ SMBOldQFSInfo(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, byte_count; - cFYI(1, "OldQFSInfo"); + cifs_dbg(FYI, "OldQFSInfo\n"); oldQFSInfoRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -4902,7 +4903,7 @@ oldQFSInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QFSInfo = %d", rc); + cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4910,7 +4911,7 @@ oldQFSInfoRetry: rc = -EIO; /* bad smb */ else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); - cFYI(1, "qfsinf resp BCC: %d Offset %d", + cifs_dbg(FYI, "qfsinf resp BCC: %d Offset %d\n", get_bcc(&pSMBr->hdr), data_offset); response_data = (FILE_SYSTEM_ALLOC_INFO *) @@ -4923,10 +4924,10 @@ oldQFSInfoRetry: le32_to_cpu(response_data->TotalAllocationUnits); FSData->f_bfree = FSData->f_bavail = le32_to_cpu(response_data->FreeAllocationUnits); - cFYI(1, "Blocks: %lld Free: %lld Block size %ld", - (unsigned long long)FSData->f_blocks, - (unsigned long long)FSData->f_bfree, - FSData->f_bsize); + cifs_dbg(FYI, "Blocks: %lld Free: %lld Block size %ld\n", + (unsigned long long)FSData->f_blocks, + 
(unsigned long long)FSData->f_bfree, + FSData->f_bsize); } } cifs_buf_release(pSMB); @@ -4949,7 +4950,7 @@ CIFSSMBQFSInfo(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, byte_count; - cFYI(1, "In QFSInfo"); + cifs_dbg(FYI, "In QFSInfo\n"); QFSInfoRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -4982,7 +4983,7 @@ QFSInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QFSInfo = %d", rc); + cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -5003,10 +5004,10 @@ QFSInfoRetry: le64_to_cpu(response_data->TotalAllocationUnits); FSData->f_bfree = FSData->f_bavail = le64_to_cpu(response_data->FreeAllocationUnits); - cFYI(1, "Blocks: %lld Free: %lld Block size %ld", - (unsigned long long)FSData->f_blocks, - (unsigned long long)FSData->f_bfree, - FSData->f_bsize); + cifs_dbg(FYI, "Blocks: %lld Free: %lld Block size %ld\n", + (unsigned long long)FSData->f_blocks, + (unsigned long long)FSData->f_bfree, + FSData->f_bsize); } } cifs_buf_release(pSMB); @@ -5028,7 +5029,7 @@ CIFSSMBQFSAttributeInfo(const unsigned int xid, struct cifs_tcon *tcon) int bytes_returned = 0; __u16 params, byte_count; - cFYI(1, "In QFSAttributeInfo"); + cifs_dbg(FYI, "In QFSAttributeInfo\n"); QFSAttributeRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -5062,7 +5063,7 @@ QFSAttributeRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cERROR(1, "Send error in QFSAttributeInfo = %d", rc); + cifs_dbg(VFS, "Send error in QFSAttributeInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -5098,7 +5099,7 @@ CIFSSMBQFSDeviceInfo(const unsigned int xid, struct cifs_tcon *tcon) int bytes_returned = 0; 
__u16 params, byte_count; - cFYI(1, "In QFSDeviceInfo"); + cifs_dbg(FYI, "In QFSDeviceInfo\n"); QFSDeviceRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -5133,7 +5134,7 @@ QFSDeviceRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QFSDeviceInfo = %d", rc); + cifs_dbg(FYI, "Send error in QFSDeviceInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -5169,7 +5170,7 @@ CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon) int bytes_returned = 0; __u16 params, byte_count; - cFYI(1, "In QFSUnixInfo"); + cifs_dbg(FYI, "In QFSUnixInfo\n"); QFSUnixRetry: rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -5203,7 +5204,7 @@ QFSUnixRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cERROR(1, "Send error in QFSUnixInfo = %d", rc); + cifs_dbg(VFS, "Send error in QFSUnixInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -5238,7 +5239,7 @@ CIFSSMBSetFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon, __u64 cap) int bytes_returned = 0; __u16 params, param_offset, offset, byte_count; - cFYI(1, "In SETFSUnixInfo"); + cifs_dbg(FYI, "In SETFSUnixInfo\n"); SETFSUnixRetry: /* BB switch to small buf init to save memory */ rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, @@ -5286,7 +5287,7 @@ SETFSUnixRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cERROR(1, "Send error in SETFSUnixInfo = %d", rc); + cifs_dbg(VFS, "Send error in SETFSUnixInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); if (rc) @@ -5314,7 +5315,7 @@ CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon, int 
bytes_returned = 0; __u16 params, byte_count; - cFYI(1, "In QFSPosixInfo"); + cifs_dbg(FYI, "In QFSPosixInfo\n"); QFSPosixRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -5348,7 +5349,7 @@ QFSPosixRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QFSUnixInfo = %d", rc); + cifs_dbg(FYI, "Send error in QFSUnixInfo = %d\n", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -5410,7 +5411,7 @@ CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon, __u16 params, byte_count, data_count, param_offset, offset; - cFYI(1, "In SetEOF"); + cifs_dbg(FYI, "In SetEOF\n"); SetEOFRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -5476,7 +5477,7 @@ SetEOFRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "SetPathInfo (file size) returned %d", rc); + cifs_dbg(FYI, "SetPathInfo (file size) returned %d\n", rc); cifs_buf_release(pSMB); @@ -5495,8 +5496,8 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; __u16 params, param_offset, offset, byte_count, count; - cFYI(1, "SetFileSize (via SetFileInfo) %lld", - (long long)size); + cifs_dbg(FYI, "SetFileSize (via SetFileInfo) %lld\n", + (long long)size); rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); if (rc) @@ -5553,7 +5554,8 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) { - cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc); + cifs_dbg(FYI, "Send error in SetFileInfo (SetFileSize) = %d\n", + rc); } /* Note: On -EAGAIN error only caller can retry on handle based calls @@ -5577,7 +5579,7 @@ CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon 
*tcon, int rc = 0; __u16 params, param_offset, offset, byte_count, count; - cFYI(1, "Set Times (via SetFileInfo)"); + cifs_dbg(FYI, "Set Times (via SetFileInfo)\n"); rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); if (rc) @@ -5623,7 +5625,8 @@ CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon, memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) - cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc); + cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n", + rc); /* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ @@ -5640,7 +5643,7 @@ CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; __u16 params, param_offset, offset, byte_count, count; - cFYI(1, "Set File Disposition (via SetFileInfo)"); + cifs_dbg(FYI, "Set File Disposition (via SetFileInfo)\n"); rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); if (rc) @@ -5682,7 +5685,7 @@ CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon, *data_offset = delete_file ? 
1 : 0; rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) - cFYI(1, "Send error in SetFileDisposition = %d", rc); + cifs_dbg(FYI, "Send error in SetFileDisposition = %d\n", rc); return rc; } @@ -5700,7 +5703,7 @@ CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon, char *data_offset; __u16 params, param_offset, offset, byte_count, count; - cFYI(1, "In SetTimes"); + cifs_dbg(FYI, "In SetTimes\n"); SetTimesRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, @@ -5756,7 +5759,7 @@ SetTimesRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "SetPathInfo (times) returned %d", rc); + cifs_dbg(FYI, "SetPathInfo (times) returned %d\n", rc); cifs_buf_release(pSMB); @@ -5781,7 +5784,7 @@ CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, char *fileName, int bytes_returned; int name_len; - cFYI(1, "In SetAttrLegacy"); + cifs_dbg(FYI, "In SetAttrLegacy\n"); SetAttrLgcyRetry: rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, @@ -5807,7 +5810,7 @@ SetAttrLgcyRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "Error in LegacySetAttr = %d", rc); + cifs_dbg(FYI, "Error in LegacySetAttr = %d\n", rc); cifs_buf_release(pSMB); @@ -5875,7 +5878,7 @@ CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; u16 params, param_offset, offset, byte_count, count; - cFYI(1, "Set Unix Info (via SetFileInfo)"); + cifs_dbg(FYI, "Set Unix Info (via SetFileInfo)\n"); rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); if (rc) @@ -5921,7 +5924,8 @@ CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon, rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0); if (rc) - cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc); + cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n", + rc); /* Note: On -EAGAIN error only 
caller can retry on handle based calls since file handle passed in no longer valid */ @@ -5943,7 +5947,7 @@ CIFSSMBUnixSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon, FILE_UNIX_BASIC_INFO *data_offset; __u16 params, param_offset, offset, count, byte_count; - cFYI(1, "In SetUID/GID/Mode"); + cifs_dbg(FYI, "In SetUID/GID/Mode\n"); setPermsRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -5999,7 +6003,7 @@ setPermsRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "SetPathInfo (perms) returned %d", rc); + cifs_dbg(FYI, "SetPathInfo (perms) returned %d\n", rc); cifs_buf_release(pSMB); if (rc == -EAGAIN) @@ -6036,7 +6040,7 @@ CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon, __u16 params, byte_count, data_offset; unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0; - cFYI(1, "In Query All EAs path %s", searchName); + cifs_dbg(FYI, "In Query All EAs path %s\n", searchName); QAllEAsRetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -6083,7 +6087,7 @@ QAllEAsRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cFYI(1, "Send error in QueryAllEAs = %d", rc); + cifs_dbg(FYI, "Send error in QueryAllEAs = %d\n", rc); goto QAllEAsOut; } @@ -6111,16 +6115,16 @@ QAllEAsRetry: (((char *) &pSMBr->hdr.Protocol) + data_offset); list_len = le32_to_cpu(ea_response_data->list_len); - cFYI(1, "ea length %d", list_len); + cifs_dbg(FYI, "ea length %d\n", list_len); if (list_len <= 8) { - cFYI(1, "empty EA list returned from server"); + cifs_dbg(FYI, "empty EA list returned from server\n"); goto QAllEAsOut; } /* make sure list_len doesn't go past end of SMB */ end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr); if ((char *)ea_response_data + list_len > end_of_smb) { - cFYI(1, "EA list appears to go beyond SMB"); + 
cifs_dbg(FYI, "EA list appears to go beyond SMB\n"); rc = -EIO; goto QAllEAsOut; } @@ -6137,7 +6141,7 @@ QAllEAsRetry: temp_ptr += 4; /* make sure we can read name_len and value_len */ if (list_len < 0) { - cFYI(1, "EA entry goes beyond length of list"); + cifs_dbg(FYI, "EA entry goes beyond length of list\n"); rc = -EIO; goto QAllEAsOut; } @@ -6146,7 +6150,7 @@ QAllEAsRetry: value_len = le16_to_cpu(temp_fea->value_len); list_len -= name_len + 1 + value_len; if (list_len < 0) { - cFYI(1, "EA entry goes beyond length of list"); + cifs_dbg(FYI, "EA entry goes beyond length of list\n"); rc = -EIO; goto QAllEAsOut; } @@ -6214,7 +6218,7 @@ CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned = 0; __u16 params, param_offset, byte_count, offset, count; - cFYI(1, "In SetEA"); + cifs_dbg(FYI, "In SetEA\n"); SetEARetry: rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); @@ -6296,7 +6300,7 @@ SetEARetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) - cFYI(1, "SetPathInfo (EA) returned %d", rc); + cifs_dbg(FYI, "SetPathInfo (EA) returned %d\n", rc); cifs_buf_release(pSMB); @@ -6339,7 +6343,7 @@ int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon, struct dir_notify_req *dnotify_req; int bytes_returned; - cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid); + cifs_dbg(FYI, "In CIFSSMBNotify for file handle %d\n", (int)netfid); rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -6368,7 +6372,7 @@ int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon, (struct smb_hdr *)pSMBr, &bytes_returned, CIFS_ASYNC_OP); if (rc) { - cFYI(1, "Error in Notify = %d", rc); + cifs_dbg(FYI, "Error in Notify = %d\n", rc); } else { /* Add file to outstanding requests */ /* BB change to kmem cache alloc */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 54125e04fd0c..99eeaa17ee00 100644 --- 
a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -95,9 +95,7 @@ enum { /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, - Opt_unc, Opt_domain, - Opt_srcaddr, Opt_prefixpath, - Opt_iocharset, Opt_sockopt, + Opt_domain, Opt_srcaddr, Opt_iocharset, Opt_netbiosname, Opt_servern, Opt_ver, Opt_vers, Opt_sec, Opt_cache, @@ -193,16 +191,15 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_blank_ip, "addr=" }, { Opt_ip, "ip=%s" }, { Opt_ip, "addr=%s" }, - { Opt_unc, "unc=%s" }, - { Opt_unc, "target=%s" }, - { Opt_unc, "path=%s" }, + { Opt_ignore, "unc=%s" }, + { Opt_ignore, "target=%s" }, + { Opt_ignore, "path=%s" }, { Opt_domain, "dom=%s" }, { Opt_domain, "domain=%s" }, { Opt_domain, "workgroup=%s" }, { Opt_srcaddr, "srcaddr=%s" }, - { Opt_prefixpath, "prefixpath=%s" }, + { Opt_ignore, "prefixpath=%s" }, { Opt_iocharset, "iocharset=%s" }, - { Opt_sockopt, "sockopt=%s" }, { Opt_netbiosname, "netbiosname=%s" }, { Opt_servern, "servern=%s" }, { Opt_ver, "ver=%s" }, @@ -319,11 +316,12 @@ cifs_reconnect(struct TCP_Server_Info *server) server->max_read = 0; #endif - cFYI(1, "Reconnecting tcp session"); + cifs_dbg(FYI, "Reconnecting tcp session\n"); /* before reconnecting the tcp session, mark the smb session (uid) and the tid bad so they are not used until reconnected */ - cFYI(1, "%s: marking sessions and tcons for reconnect", __func__); + cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect\n", + __func__); spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp, &server->smb_ses_list) { ses = list_entry(tmp, struct cifs_ses, smb_ses_list); @@ -337,15 +335,14 @@ cifs_reconnect(struct TCP_Server_Info *server) spin_unlock(&cifs_tcp_ses_lock); /* do not want to be sending data on a socket we are freeing */ - cFYI(1, "%s: tearing down socket", __func__); + cifs_dbg(FYI, "%s: tearing down socket\n", __func__); mutex_lock(&server->srv_mutex); if (server->ssocket) { - cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state, - 
server->ssocket->flags); + cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n", + server->ssocket->state, server->ssocket->flags); kernel_sock_shutdown(server->ssocket, SHUT_WR); - cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx", - server->ssocket->state, - server->ssocket->flags); + cifs_dbg(FYI, "Post shutdown state: 0x%x Flags: 0x%lx\n", + server->ssocket->state, server->ssocket->flags); sock_release(server->ssocket); server->ssocket = NULL; } @@ -359,7 +356,7 @@ cifs_reconnect(struct TCP_Server_Info *server) /* mark submitted MIDs for retry and issue callback */ INIT_LIST_HEAD(&retry_list); - cFYI(1, "%s: moving mids to private list", __func__); + cifs_dbg(FYI, "%s: moving mids to private list\n", __func__); spin_lock(&GlobalMid_Lock); list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); @@ -369,7 +366,7 @@ cifs_reconnect(struct TCP_Server_Info *server) } spin_unlock(&GlobalMid_Lock); - cFYI(1, "%s: issuing mid callbacks", __func__); + cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__); list_for_each_safe(tmp, tmp2, &retry_list) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); list_del_init(&mid_entry->qhead); @@ -382,7 +379,7 @@ cifs_reconnect(struct TCP_Server_Info *server) /* we should try only the port we connected to before */ rc = generic_ip_connect(server); if (rc) { - cFYI(1, "reconnect error %d", rc); + cifs_dbg(FYI, "reconnect error %d\n", rc); msleep(3000); } else { atomic_inc(&tcpSesReconnectCount); @@ -416,8 +413,8 @@ cifs_echo_request(struct work_struct *work) rc = server->ops->echo ? 
server->ops->echo(server) : -ENOSYS; if (rc) - cFYI(1, "Unable to send echo request to server: %s", - server->hostname); + cifs_dbg(FYI, "Unable to send echo request to server: %s\n", + server->hostname); requeue_echo: queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL); @@ -429,7 +426,7 @@ allocate_buffers(struct TCP_Server_Info *server) if (!server->bigbuf) { server->bigbuf = (char *)cifs_buf_get(); if (!server->bigbuf) { - cERROR(1, "No memory for large SMB response"); + cifs_dbg(VFS, "No memory for large SMB response\n"); msleep(3000); /* retry will check if exiting */ return false; @@ -442,7 +439,7 @@ allocate_buffers(struct TCP_Server_Info *server) if (!server->smallbuf) { server->smallbuf = (char *)cifs_small_buf_get(); if (!server->smallbuf) { - cERROR(1, "No memory for SMB response"); + cifs_dbg(VFS, "No memory for SMB response\n"); msleep(1000); /* retry will check if exiting */ return false; @@ -472,9 +469,8 @@ server_unresponsive(struct TCP_Server_Info *server) */ if (server->tcpStatus == CifsGood && time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) { - cERROR(1, "Server %s has not responded in %d seconds. " - "Reconnecting...", server->hostname, - (2 * SMB_ECHO_INTERVAL) / HZ); + cifs_dbg(VFS, "Server %s has not responded in %d seconds. 
Reconnecting...\n", + server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ); cifs_reconnect(server); wake_up(&server->response_q); return true; @@ -585,8 +581,8 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, length = 0; continue; } else if (length <= 0) { - cFYI(1, "Received no data or error: expecting %d " - "got %d", to_read, length); + cifs_dbg(FYI, "Received no data or error: expecting %d\n" + "got %d", to_read, length); cifs_reconnect(server); total_read = -EAGAIN; break; @@ -620,17 +616,17 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) /* Regular SMB response */ return true; case RFC1002_SESSION_KEEP_ALIVE: - cFYI(1, "RFC 1002 session keep alive"); + cifs_dbg(FYI, "RFC 1002 session keep alive\n"); break; case RFC1002_POSITIVE_SESSION_RESPONSE: - cFYI(1, "RFC 1002 positive session response"); + cifs_dbg(FYI, "RFC 1002 positive session response\n"); break; case RFC1002_NEGATIVE_SESSION_RESPONSE: /* * We get this from Windows 98 instead of an error on * SMB negprot response. 
*/ - cFYI(1, "RFC 1002 negative session response"); + cifs_dbg(FYI, "RFC 1002 negative session response\n"); /* give server a second to clean up */ msleep(1000); /* @@ -644,7 +640,7 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) wake_up(&server->response_q); break; default: - cERROR(1, "RFC 1002 unknown response type 0x%x", type); + cifs_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type); cifs_reconnect(server); } @@ -730,7 +726,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) spin_lock(&GlobalMid_Lock); list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cFYI(1, "Clearing mid 0x%llx", mid_entry->mid); + cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid); mid_entry->mid_state = MID_SHUTDOWN; list_move(&mid_entry->qhead, &dispose_list); } @@ -739,7 +735,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) /* now walk dispose list and issue callbacks */ list_for_each_safe(tmp, tmp2, &dispose_list) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cFYI(1, "Callback mid 0x%llx", mid_entry->mid); + cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid); list_del_init(&mid_entry->qhead); mid_entry->callback(mid_entry); } @@ -756,7 +752,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) * least 45 seconds before giving up on a request getting a * response and going ahead and killing cifsd. 
*/ - cFYI(1, "Wait for exit from demultiplex thread"); + cifs_dbg(FYI, "Wait for exit from demultiplex thread\n"); msleep(46000); /* * If threads still have not exited they are probably never @@ -783,8 +779,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) /* make sure this will fit in a large buffer */ if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - 4) { - cERROR(1, "SMB response too long (%u bytes)", - pdu_length); + cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); cifs_reconnect(server); wake_up(&server->response_q); return -EAGAIN; @@ -842,7 +837,7 @@ cifs_demultiplex_thread(void *p) struct mid_q_entry *mid_entry; current->flags |= PF_MEMALLOC; - cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); + cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current)); length = atomic_inc_return(&tcpSesAllocCount); if (length > 1) @@ -872,14 +867,14 @@ cifs_demultiplex_thread(void *p) */ pdu_length = get_rfc1002_length(buf); - cFYI(1, "RFC1002 header 0x%x", pdu_length); + cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length); if (!is_smb_response(server, buf[0])) continue; /* make sure we have enough to get to the MID */ if (pdu_length < HEADER_SIZE(server) - 1 - 4) { - cERROR(1, "SMB response too short (%u bytes)", - pdu_length); + cifs_dbg(VFS, "SMB response too short (%u bytes)\n", + pdu_length); cifs_reconnect(server); wake_up(&server->response_q); continue; @@ -911,8 +906,8 @@ cifs_demultiplex_thread(void *p) mid_entry->callback(mid_entry); } else if (!server->ops->is_oplock_break || !server->ops->is_oplock_break(buf, server)) { - cERROR(1, "No task to wake, unknown frame received! " - "NumMids %d", atomic_read(&midCount)); + cifs_dbg(VFS, "No task to wake, unknown frame received! 
NumMids %d\n", + atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", buf, HEADER_SIZE(server)); #ifdef CONFIG_CIFS_DEBUG2 @@ -1038,7 +1033,7 @@ static int cifs_parse_security_flavors(char *value, break; case Opt_sec_krb5p: /* vol->secFlg |= CIFSSEC_MUST_SEAL | CIFSSEC_MAY_KRB5; */ - cERROR(1, "Krb5 cifs privacy not supported"); + cifs_dbg(VFS, "Krb5 cifs privacy not supported\n"); break; case Opt_sec_ntlmssp: vol->secFlg |= CIFSSEC_MAY_NTLMSSP; @@ -1068,7 +1063,7 @@ static int cifs_parse_security_flavors(char *value, vol->nullauth = 1; break; default: - cERROR(1, "bad security option: %s", value); + cifs_dbg(VFS, "bad security option: %s\n", value); return 1; } @@ -1094,7 +1089,7 @@ cifs_parse_cache_flavor(char *value, struct smb_vol *vol) vol->strict_io = false; break; default: - cERROR(1, "bad cache= option: %s", value); + cifs_dbg(VFS, "bad cache= option: %s\n", value); return 1; } return 0; @@ -1125,7 +1120,7 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) break; #endif default: - cERROR(1, "Unknown vers= option specified: %s", value); + cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); return 1; } return 0; @@ -1256,7 +1251,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, separator[0] = options[4]; options += 5; } else { - cFYI(1, "Null separator not allowed"); + cifs_dbg(FYI, "Null separator not allowed\n"); } } vol->backupuid_specified = false; /* no backup intent for a user */ @@ -1441,8 +1436,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, break; case Opt_fsc: #ifndef CONFIG_CIFS_FSCACHE - cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE " - "kernel config option set"); + cifs_dbg(VFS, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n"); goto cifs_parse_mount_err; #endif vol->fsc = true; @@ -1460,55 +1454,55 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* Numeric Values */ case Opt_backupuid: if (get_option_uid(args, 
&vol->backupuid)) { - cERROR(1, "%s: Invalid backupuid value", - __func__); + cifs_dbg(VFS, "%s: Invalid backupuid value\n", + __func__); goto cifs_parse_mount_err; } vol->backupuid_specified = true; break; case Opt_backupgid: if (get_option_gid(args, &vol->backupgid)) { - cERROR(1, "%s: Invalid backupgid value", - __func__); + cifs_dbg(VFS, "%s: Invalid backupgid value\n", + __func__); goto cifs_parse_mount_err; } vol->backupgid_specified = true; break; case Opt_uid: if (get_option_uid(args, &vol->linux_uid)) { - cERROR(1, "%s: Invalid uid value", - __func__); + cifs_dbg(VFS, "%s: Invalid uid value\n", + __func__); goto cifs_parse_mount_err; } uid_specified = true; break; case Opt_cruid: if (get_option_uid(args, &vol->cred_uid)) { - cERROR(1, "%s: Invalid cruid value", - __func__); + cifs_dbg(VFS, "%s: Invalid cruid value\n", + __func__); goto cifs_parse_mount_err; } break; case Opt_gid: if (get_option_gid(args, &vol->linux_gid)) { - cERROR(1, "%s: Invalid gid value", - __func__); + cifs_dbg(VFS, "%s: Invalid gid value\n", + __func__); goto cifs_parse_mount_err; } gid_specified = true; break; case Opt_file_mode: if (get_option_ul(args, &option)) { - cERROR(1, "%s: Invalid file_mode value", - __func__); + cifs_dbg(VFS, "%s: Invalid file_mode value\n", + __func__); goto cifs_parse_mount_err; } vol->file_mode = option; break; case Opt_dirmode: if (get_option_ul(args, &option)) { - cERROR(1, "%s: Invalid dir_mode value", - __func__); + cifs_dbg(VFS, "%s: Invalid dir_mode value\n", + __func__); goto cifs_parse_mount_err; } vol->dir_mode = option; @@ -1516,37 +1510,37 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_port: if (get_option_ul(args, &option) || option > USHRT_MAX) { - cERROR(1, "%s: Invalid port value", __func__); + cifs_dbg(VFS, "%s: Invalid port value\n", + __func__); goto cifs_parse_mount_err; } port = (unsigned short)option; break; case Opt_rsize: if (get_option_ul(args, &option)) { - cERROR(1, "%s: Invalid rsize value", 
- __func__); + cifs_dbg(VFS, "%s: Invalid rsize value\n", + __func__); goto cifs_parse_mount_err; } vol->rsize = option; break; case Opt_wsize: if (get_option_ul(args, &option)) { - cERROR(1, "%s: Invalid wsize value", - __func__); + cifs_dbg(VFS, "%s: Invalid wsize value\n", + __func__); goto cifs_parse_mount_err; } vol->wsize = option; break; case Opt_actimeo: if (get_option_ul(args, &option)) { - cERROR(1, "%s: Invalid actimeo value", - __func__); + cifs_dbg(VFS, "%s: Invalid actimeo value\n", + __func__); goto cifs_parse_mount_err; } vol->actimeo = HZ * option; if (vol->actimeo > CIFS_MAX_ACTIMEO) { - cERROR(1, "CIFS: attribute cache" - "timeout too large"); + cifs_dbg(VFS, "attribute cache timeout too large\n"); goto cifs_parse_mount_err; } break; @@ -1569,21 +1563,28 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } vol->username = kstrdup(string, GFP_KERNEL); - if (!vol->username) { - printk(KERN_WARNING "CIFS: no memory " - "for username\n"); + if (!vol->username) goto cifs_parse_mount_err; - } break; case Opt_blank_pass: - vol->password = NULL; - break; - case Opt_pass: /* passwords have to be handled differently * to allow the character used for deliminator * to be passed within them */ + /* + * Check if this is a case where the password + * starts with a delimiter + */ + tmp_end = strchr(data, '='); + tmp_end++; + if (!(tmp_end < end && tmp_end[1] == delim)) { + /* No it is not. Set the password to NULL */ + vol->password = NULL; + break; + } + /* Yes it is. 
Drop down to Opt_pass below.*/ + case Opt_pass: /* Obtain the value string */ value = strchr(data, '='); value++; @@ -1651,30 +1652,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } got_ip = true; break; - case Opt_unc: - string = vol->UNC; - vol->UNC = match_strdup(args); - if (vol->UNC == NULL) - goto out_nomem; - - convert_delimiter(vol->UNC, '\\'); - if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') { - printk(KERN_ERR "CIFS: UNC Path does not " - "begin with // or \\\\\n"); - goto cifs_parse_mount_err; - } - - /* Compare old unc= option to new one */ - if (!string || strcmp(string, vol->UNC)) - printk(KERN_WARNING "CIFS: the value of the " - "unc= mount option does not match the " - "device string. Using the unc= option " - "for now. In 3.10, that option will " - "be ignored and the contents of the " - "device string will be used " - "instead. (%s != %s)\n", string, - vol->UNC); - break; case Opt_domain: string = match_strdup(args); if (string == NULL) @@ -1692,7 +1669,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, "for domainname\n"); goto cifs_parse_mount_err; } - cFYI(1, "Domain name set"); + cifs_dbg(FYI, "Domain name set\n"); break; case Opt_srcaddr: string = match_strdup(args); @@ -1707,26 +1684,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } break; - case Opt_prefixpath: - /* skip over any leading delimiter */ - if (*args[0].from == '/' || *args[0].from == '\\') - args[0].from++; - - string = vol->prepath; - vol->prepath = match_strdup(args); - if (vol->prepath == NULL) - goto out_nomem; - /* Compare old prefixpath= option to new one */ - if (!string || strcmp(string, vol->prepath)) - printk(KERN_WARNING "CIFS: the value of the " - "prefixpath= mount option does not " - "match the device string. Using the " - "prefixpath= option for now. 
In 3.10, " - "that option will be ignored and the " - "contents of the device string will be " - "used instead.(%s != %s)\n", string, - vol->prepath); - break; case Opt_iocharset: string = match_strdup(args); if (string == NULL) @@ -1750,20 +1707,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* if iocharset not set then load_nls_default * is used by caller */ - cFYI(1, "iocharset set to %s", string); - break; - case Opt_sockopt: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - - if (strnicmp(string, "TCP_NODELAY", 11) == 0) { - printk(KERN_WARNING "CIFS: the " - "sockopt=TCP_NODELAY option has been " - "deprecated and will be removed " - "in 3.9\n"); - vol->sockopt_tcp_nodelay = 1; - } + cifs_dbg(FYI, "iocharset set to %s\n", string); break; case Opt_netbiosname: string = match_strdup(args); @@ -1877,20 +1821,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, #ifndef CONFIG_KEYS /* Muliuser mounts require CONFIG_KEYS support */ if (vol->multiuser) { - cERROR(1, "Multiuser mounts require kernels with " - "CONFIG_KEYS enabled."); + cifs_dbg(VFS, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n"); goto cifs_parse_mount_err; } #endif if (!vol->UNC) { - cERROR(1, "CIFS mount error: No usable UNC path provided in " - "device string or in unc= option!"); + cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string or in unc= option!\n"); goto cifs_parse_mount_err; } /* make sure UNC has a share name */ if (!strchr(vol->UNC + 3, '\\')) { - cERROR(1, "Malformed UNC. Unable to find share name."); + cifs_dbg(VFS, "Malformed UNC. 
Unable to find share name.\n"); goto cifs_parse_mount_err; } @@ -2111,7 +2053,7 @@ cifs_find_tcp_session(struct smb_vol *vol) ++server->srv_count; spin_unlock(&cifs_tcp_ses_lock); - cFYI(1, "Existing tcp session with server found"); + cifs_dbg(FYI, "Existing tcp session with server found\n"); return server; } spin_unlock(&cifs_tcp_ses_lock); @@ -2158,7 +2100,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) struct TCP_Server_Info *tcp_ses = NULL; int rc; - cFYI(1, "UNC: %s", volume_info->UNC); + cifs_dbg(FYI, "UNC: %s\n", volume_info->UNC); /* see if we already have a matching tcp_ses */ tcp_ses = cifs_find_tcp_session(volume_info); @@ -2173,7 +2115,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) rc = cifs_crypto_shash_allocate(tcp_ses); if (rc) { - cERROR(1, "could not setup hash structures rc %d", rc); + cifs_dbg(VFS, "could not setup hash structures rc %d\n", rc); goto out_err; } @@ -2220,7 +2162,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) rc = ip_connect(tcp_ses); if (rc < 0) { - cERROR(1, "Error connecting to socket. Aborting operation"); + cifs_dbg(VFS, "Error connecting to socket. 
Aborting operation.\n"); goto out_err_crypto_release; } @@ -2233,7 +2175,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses, "cifsd"); if (IS_ERR(tcp_ses->tsk)) { rc = PTR_ERR(tcp_ses->tsk); - cERROR(1, "error %d create cifsd thread", rc); + cifs_dbg(VFS, "error %d create cifsd thread\n", rc); module_put(THIS_MODULE); goto out_err_crypto_release; } @@ -2320,7 +2262,7 @@ cifs_put_smb_ses(struct cifs_ses *ses) unsigned int xid; struct TCP_Server_Info *server = ses->server; - cFYI(1, "%s: ses_count=%d", __func__, ses->ses_count); + cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); spin_lock(&cifs_tcp_ses_lock); if (--ses->ses_count > 0) { spin_unlock(&cifs_tcp_ses_lock); @@ -2372,23 +2314,24 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr); break; default: - cFYI(1, "Bad ss_family (%hu)", server->dstaddr.ss_family); + cifs_dbg(FYI, "Bad ss_family (%hu)\n", + server->dstaddr.ss_family); rc = -EINVAL; goto out_err; } - cFYI(1, "%s: desc=%s", __func__, desc); + cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc); key = request_key(&key_type_logon, desc, ""); if (IS_ERR(key)) { if (!ses->domainName) { - cFYI(1, "domainName is NULL"); + cifs_dbg(FYI, "domainName is NULL\n"); rc = PTR_ERR(key); goto out_err; } /* didn't work, try to find a domain key */ sprintf(desc, "cifs:d:%s", ses->domainName); - cFYI(1, "%s: desc=%s", __func__, desc); + cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc); key = request_key(&key_type_logon, desc, ""); if (IS_ERR(key)) { rc = PTR_ERR(key); @@ -2406,32 +2349,34 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) /* find first : in payload */ payload = (char *)upayload->data; delim = strnchr(payload, upayload->datalen, ':'); - cFYI(1, "payload=%s", payload); + cifs_dbg(FYI, "payload=%s\n", payload); if (!delim) { - cFYI(1, "Unable to find ':' in payload (datalen=%d)", - upayload->datalen); + cifs_dbg(FYI, "Unable to find ':' in payload 
(datalen=%d)\n", + upayload->datalen); rc = -EINVAL; goto out_key_put; } len = delim - payload; if (len > MAX_USERNAME_SIZE || len <= 0) { - cFYI(1, "Bad value from username search (len=%zd)", len); + cifs_dbg(FYI, "Bad value from username search (len=%zd)\n", + len); rc = -EINVAL; goto out_key_put; } vol->username = kstrndup(payload, len, GFP_KERNEL); if (!vol->username) { - cFYI(1, "Unable to allocate %zd bytes for username", len); + cifs_dbg(FYI, "Unable to allocate %zd bytes for username\n", + len); rc = -ENOMEM; goto out_key_put; } - cFYI(1, "%s: username=%s", __func__, vol->username); + cifs_dbg(FYI, "%s: username=%s\n", __func__, vol->username); len = key->datalen - (len + 1); if (len > MAX_PASSWORD_SIZE || len <= 0) { - cFYI(1, "Bad len for password search (len=%zd)", len); + cifs_dbg(FYI, "Bad len for password search (len=%zd)\n", len); rc = -EINVAL; kfree(vol->username); vol->username = NULL; @@ -2441,7 +2386,8 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) ++delim; vol->password = kstrndup(delim, len, GFP_KERNEL); if (!vol->password) { - cFYI(1, "Unable to allocate %zd bytes for password", len); + cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n", + len); rc = -ENOMEM; kfree(vol->username); vol->username = NULL; @@ -2453,7 +2399,7 @@ out_key_put: key_put(key); out_err: kfree(desc); - cFYI(1, "%s: returning %d", __func__, rc); + cifs_dbg(FYI, "%s: returning %d\n", __func__, rc); return rc; } #else /* ! 
CONFIG_KEYS */ @@ -2478,7 +2424,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) ses = cifs_find_smb_ses(server, volume_info); if (ses) { - cFYI(1, "Existing smb sess found (status=%d)", ses->status); + cifs_dbg(FYI, "Existing smb sess found (status=%d)\n", + ses->status); mutex_lock(&ses->session_mutex); rc = cifs_negotiate_protocol(xid, ses); @@ -2490,7 +2437,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) return ERR_PTR(rc); } if (ses->need_reconnect) { - cFYI(1, "Session needs reconnect"); + cifs_dbg(FYI, "Session needs reconnect\n"); rc = cifs_setup_session(xid, ses, volume_info->local_nls); if (rc) { @@ -2509,7 +2456,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) return ses; } - cFYI(1, "Existing smb sess not found"); + cifs_dbg(FYI, "Existing smb sess not found\n"); ses = sesInfoAlloc(); if (ses == NULL) goto get_ses_fail; @@ -2599,7 +2546,7 @@ cifs_put_tcon(struct cifs_tcon *tcon) unsigned int xid; struct cifs_ses *ses = tcon->ses; - cFYI(1, "%s: tc_count=%d", __func__, tcon->tc_count); + cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count); spin_lock(&cifs_tcp_ses_lock); if (--tcon->tc_count > 0) { spin_unlock(&cifs_tcp_ses_lock); @@ -2627,12 +2574,11 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon = cifs_find_tcon(ses, volume_info->UNC); if (tcon) { - cFYI(1, "Found match on UNC path"); + cifs_dbg(FYI, "Found match on UNC path\n"); /* existing tcon already has a reference */ cifs_put_smb_ses(ses); if (tcon->seal != volume_info->seal) - cERROR(1, "transport encryption setting " - "conflicts with existing tid"); + cifs_dbg(VFS, "transport encryption setting conflicts with existing tid\n"); return tcon; } @@ -2664,13 +2610,13 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) rc = ses->server->ops->tree_connect(xid, ses, volume_info->UNC, tcon, volume_info->local_nls); free_xid(xid); - cFYI(1, "Tcon rc = 
%d", rc); + cifs_dbg(FYI, "Tcon rc = %d\n", rc); if (rc) goto out_fail; if (volume_info->nodfs) { tcon->Flags &= ~SMB_SHARE_IS_IN_DFS; - cFYI(1, "DFS disabled (%d)", tcon->Flags); + cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags); } tcon->seal = volume_info->seal; /* @@ -2824,7 +2770,7 @@ get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path, strcpy(temp_unc + 2 + strlen(ses->serverName), "\\IPC$"); rc = ses->server->ops->tree_connect(xid, ses, temp_unc, NULL, nls_codepage); - cFYI(1, "Tcon rc = %d ipc_tid = %d", rc, ses->ipc_tid); + cifs_dbg(FYI, "Tcon rc = %d ipc_tid = %d\n", rc, ses->ipc_tid); kfree(temp_unc); } if (rc == 0) @@ -2902,13 +2848,11 @@ bind_socket(struct TCP_Server_Info *server) saddr4 = (struct sockaddr_in *)&server->srcaddr; saddr6 = (struct sockaddr_in6 *)&server->srcaddr; if (saddr6->sin6_family == AF_INET6) - cERROR(1, "cifs: " - "Failed to bind to: %pI6c, error: %d", - &saddr6->sin6_addr, rc); + cifs_dbg(VFS, "Failed to bind to: %pI6c, error: %d\n", + &saddr6->sin6_addr, rc); else - cERROR(1, "cifs: " - "Failed to bind to: %pI4, error: %d", - &saddr4->sin_addr.s_addr, rc); + cifs_dbg(VFS, "Failed to bind to: %pI4, error: %d\n", + &saddr4->sin_addr.s_addr, rc); } } return rc; @@ -3013,13 +2957,13 @@ generic_ip_connect(struct TCP_Server_Info *server) rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM, IPPROTO_TCP, &socket, 1); if (rc < 0) { - cERROR(1, "Error %d creating socket", rc); + cifs_dbg(VFS, "Error %d creating socket\n", rc); server->ssocket = NULL; return rc; } /* BB other socket options to set KEEPALIVE, NODELAY? 
*/ - cFYI(1, "Socket created"); + cifs_dbg(FYI, "Socket created\n"); server->ssocket = socket; socket->sk->sk_allocation = GFP_NOFS; if (sfamily == AF_INET6) @@ -3053,16 +2997,17 @@ generic_ip_connect(struct TCP_Server_Info *server) rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY, (char *)&val, sizeof(val)); if (rc) - cFYI(1, "set TCP_NODELAY socket option error %d", rc); + cifs_dbg(FYI, "set TCP_NODELAY socket option error %d\n", + rc); } - cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx", + cifs_dbg(FYI, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx\n", socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); rc = socket->ops->connect(socket, saddr, slen, 0); if (rc < 0) { - cFYI(1, "Error %d connecting to server", rc); + cifs_dbg(FYI, "Error %d connecting to server\n", rc); sock_release(socket); server->ssocket = NULL; return rc; @@ -3120,19 +3065,19 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, if (vol_info && vol_info->no_linux_ext) { tcon->fsUnixInfo.Capability = 0; tcon->unix_ext = 0; /* Unix Extensions disabled */ - cFYI(1, "Linux protocol extensions disabled"); + cifs_dbg(FYI, "Linux protocol extensions disabled\n"); return; } else if (vol_info) tcon->unix_ext = 1; /* Unix Extensions supported */ if (tcon->unix_ext == 0) { - cFYI(1, "Unix extensions disabled so not set on reconnect"); + cifs_dbg(FYI, "Unix extensions disabled so not set on reconnect\n"); return; } if (!CIFSSMBQFSUnixInfo(xid, tcon)) { __u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability); - cFYI(1, "unix caps which server supports %lld", cap); + cifs_dbg(FYI, "unix caps which server supports %lld\n", cap); /* check for reconnect case in which we do not want to change the mount behavior if we can avoid it */ if (vol_info == NULL) { @@ -3142,22 +3087,22 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, cap &= ~CIFS_UNIX_POSIX_ACL_CAP; if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) - cERROR(1, 
"POSIXPATH support change"); + cifs_dbg(VFS, "POSIXPATH support change\n"); cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; } else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) { - cERROR(1, "possible reconnect error"); - cERROR(1, "server disabled POSIX path support"); + cifs_dbg(VFS, "possible reconnect error\n"); + cifs_dbg(VFS, "server disabled POSIX path support\n"); } } if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP) - cERROR(1, "per-share encryption not supported yet"); + cifs_dbg(VFS, "per-share encryption not supported yet\n"); cap &= CIFS_UNIX_CAP_MASK; if (vol_info && vol_info->no_psx_acl) cap &= ~CIFS_UNIX_POSIX_ACL_CAP; else if (CIFS_UNIX_POSIX_ACL_CAP & cap) { - cFYI(1, "negotiated posix acl support"); + cifs_dbg(FYI, "negotiated posix acl support\n"); if (cifs_sb) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIXACL; @@ -3166,43 +3111,38 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, if (vol_info && vol_info->posix_paths == 0) cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { - cFYI(1, "negotiate posix pathnames"); + cifs_dbg(FYI, "negotiate posix pathnames\n"); if (cifs_sb) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; } - cFYI(1, "Negotiate caps 0x%x", (int)cap); + cifs_dbg(FYI, "Negotiate caps 0x%x\n", (int)cap); #ifdef CONFIG_CIFS_DEBUG2 if (cap & CIFS_UNIX_FCNTL_CAP) - cFYI(1, "FCNTL cap"); + cifs_dbg(FYI, "FCNTL cap\n"); if (cap & CIFS_UNIX_EXTATTR_CAP) - cFYI(1, "EXTATTR cap"); + cifs_dbg(FYI, "EXTATTR cap\n"); if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) - cFYI(1, "POSIX path cap"); + cifs_dbg(FYI, "POSIX path cap\n"); if (cap & CIFS_UNIX_XATTR_CAP) - cFYI(1, "XATTR cap"); + cifs_dbg(FYI, "XATTR cap\n"); if (cap & CIFS_UNIX_POSIX_ACL_CAP) - cFYI(1, "POSIX ACL cap"); + cifs_dbg(FYI, "POSIX ACL cap\n"); if (cap & CIFS_UNIX_LARGE_READ_CAP) - cFYI(1, "very large read cap"); + cifs_dbg(FYI, "very large read cap\n"); if (cap & CIFS_UNIX_LARGE_WRITE_CAP) - cFYI(1, "very large write cap"); + 
cifs_dbg(FYI, "very large write cap\n"); if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP) - cFYI(1, "transport encryption cap"); + cifs_dbg(FYI, "transport encryption cap\n"); if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP) - cFYI(1, "mandatory transport encryption cap"); + cifs_dbg(FYI, "mandatory transport encryption cap\n"); #endif /* CIFS_DEBUG2 */ if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { if (vol_info == NULL) { - cFYI(1, "resetting capabilities failed"); + cifs_dbg(FYI, "resetting capabilities failed\n"); } else - cERROR(1, "Negotiating Unix capabilities " - "with the server failed. Consider " - "mounting with the Unix Extensions " - "disabled if problems are found " - "by specifying the nounix mount " - "option."); + cifs_dbg(VFS, "Negotiating Unix capabilities with the server failed. Consider mounting with the Unix Extensions disabled if problems are found by specifying the nounix mount option.\n"); } } @@ -3227,8 +3167,8 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_gid = pvolume_info->linux_gid; cifs_sb->mnt_file_mode = pvolume_info->file_mode; cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; - cFYI(1, "file mode: 0x%hx dir mode: 0x%hx", - cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); + cifs_dbg(FYI, "file mode: 0x%hx dir mode: 0x%hx\n", + cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); cifs_sb->actimeo = pvolume_info->actimeo; cifs_sb->local_nls = pvolume_info->local_nls; @@ -3277,21 +3217,19 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, if (pvolume_info->strict_io) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO; if (pvolume_info->direct_io) { - cFYI(1, "mounting share using direct i/o"); + cifs_dbg(FYI, "mounting share using direct i/o\n"); cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; } if (pvolume_info->mfsymlinks) { if (pvolume_info->sfu_emul) { - cERROR(1, "mount option mfsymlinks ignored if sfu " - "mount option is used"); + cifs_dbg(VFS, "mount option mfsymlinks ignored if sfu mount option is 
used\n"); } else { cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS; } } if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) - cERROR(1, "mount option dynperm ignored if cifsacl " - "mount option supported"); + cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n"); } static void @@ -3343,7 +3281,7 @@ build_unc_path_to_root(const struct smb_vol *vol, *pos = '\0'; /* add trailing null */ convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); - cFYI(1, "%s: full_path=%s", __func__, full_path); + cifs_dbg(FYI, "%s: full_path=%s\n", __func__, full_path); return full_path; } @@ -3414,14 +3352,14 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, return -EINVAL; if (volume_info->nullauth) { - cFYI(1, "Anonymous login"); + cifs_dbg(FYI, "Anonymous login\n"); kfree(volume_info->username); volume_info->username = NULL; } else if (volume_info->username) { /* BB fixme parse for domain name here */ - cFYI(1, "Username: %s", volume_info->username); + cifs_dbg(FYI, "Username: %s\n", volume_info->username); } else { - cifserror("No username specified"); + cifs_dbg(VFS, "No username specified\n"); /* In userspace mount helper we can get user name from alternate locations such as env variables and files on disk */ return -EINVAL; @@ -3434,7 +3372,7 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, } else { volume_info->local_nls = load_nls(volume_info->iocharset); if (volume_info->local_nls == NULL) { - cERROR(1, "CIFS mount error: iocharset %s not found", + cifs_dbg(VFS, "CIFS mount error: iocharset %s not found\n", volume_info->iocharset); return -ELIBACC; } @@ -3784,13 +3722,13 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, if (length == 3) { if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && (bcc_ptr[2] == 'C')) { - cFYI(1, "IPC connection"); + cifs_dbg(FYI, "IPC connection\n"); tcon->ipc = 1; } } else if (length == 2) { if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) { /* the most common case 
*/ - cFYI(1, "disk share connection"); + cifs_dbg(FYI, "disk share connection\n"); } } bcc_ptr += length + 1; @@ -3803,7 +3741,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, bytes_left, is_unicode, nls_codepage); - cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem); + cifs_dbg(FYI, "nativeFileSystem=%s\n", tcon->nativeFileSystem); if ((smb_buffer_response->WordCount == 3) || (smb_buffer_response->WordCount == 7)) @@ -3811,7 +3749,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport); else tcon->Flags = 0; - cFYI(1, "Tcon flags: 0x%x ", tcon->Flags); + cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags); } else if ((rc == 0) && tcon == NULL) { /* all we need to save for IPC$ connection */ ses->ipc_tid = smb_buffer_response->Tid; @@ -3889,16 +3827,16 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (linuxExtEnabled == 0) ses->capabilities &= (~server->vals->cap_unix); - cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", + cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n", server->sec_mode, server->capabilities, server->timeAdj); if (server->ops->sess_setup) rc = server->ops->sess_setup(xid, ses, nls_info); if (rc) { - cERROR(1, "Send error in SessSetup = %d", rc); + cifs_dbg(VFS, "Send error in SessSetup = %d\n", rc); } else { - mutex_lock(&ses->server->srv_mutex); + mutex_lock(&server->srv_mutex); if (!server->session_estab) { server->session_key.response = ses->auth_key.response; server->session_key.len = ses->auth_key.len; @@ -3908,7 +3846,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, } mutex_unlock(&server->srv_mutex); - cFYI(1, "CIFS Session Established successfully"); + cifs_dbg(FYI, "CIFS Session Established successfully\n"); spin_lock(&GlobalMid_Lock); ses->status = CifsGood; ses->need_reconnect = false; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 1cd016217448..5699b5036ed8 100644 --- a/fs/cifs/dir.c +++ 
b/fs/cifs/dir.c @@ -102,7 +102,7 @@ cifs_bp_rename_retry: namelen += (1 + temp->d_name.len); temp = temp->d_parent; if (temp == NULL) { - cERROR(1, "corrupt dentry"); + cifs_dbg(VFS, "corrupt dentry\n"); rcu_read_unlock(); return NULL; } @@ -124,12 +124,12 @@ cifs_bp_rename_retry: full_path[namelen] = dirsep; strncpy(full_path + namelen + 1, temp->d_name.name, temp->d_name.len); - cFYI(0, "name: %s", full_path + namelen); + cifs_dbg(FYI, "name: %s\n", full_path + namelen); } spin_unlock(&temp->d_lock); temp = temp->d_parent; if (temp == NULL) { - cERROR(1, "corrupt dentry"); + cifs_dbg(VFS, "corrupt dentry\n"); rcu_read_unlock(); kfree(full_path); return NULL; @@ -137,8 +137,8 @@ cifs_bp_rename_retry: } rcu_read_unlock(); if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { - cFYI(1, "did not end path lookup where expected. namelen=%d " - "dfsplen=%d", namelen, dfsplen); + cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n", + namelen, dfsplen); /* presumably this is only possible if racing with a rename of one of the parent directories (we can not lock the dentries above us to prevent this, but retrying should be harmless) */ @@ -178,7 +178,7 @@ check_name(struct dentry *direntry) if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { for (i = 0; i < direntry->d_name.len; i++) { if (direntry->d_name.name[i] == '\\') { - cFYI(1, "Invalid file name"); + cifs_dbg(FYI, "Invalid file name\n"); return -EINVAL; } } @@ -291,7 +291,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, else if ((oflags & O_CREAT) == O_CREAT) disposition = FILE_OPEN_IF; else - cFYI(1, "Create flag not set in create function"); + cifs_dbg(FYI, "Create flag not set in create function\n"); /* * BB add processing to set equivalent of mode - e.g. 
via CreateX with @@ -323,7 +323,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, desired_access, create_options, fid, oplock, buf, cifs_sb); if (rc) { - cFYI(1, "cifs_create returned 0x%x", rc); + cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc); goto out; } @@ -389,7 +389,8 @@ cifs_create_get_file_info: cifs_create_set_dentry: if (rc != 0) { - cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); + cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n", + rc); if (server->ops->close) server->ops->close(xid, tcon, fid); goto out; @@ -452,12 +453,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, xid = get_xid(); - cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p", - inode, direntry->d_name.name, direntry); + cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n", + inode, direntry->d_name.name, direntry); tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); - if (IS_ERR(tlink)) + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); goto out_free_xid; + } tcon = tlink_tcon(tlink); server = tcon->ses->server; @@ -518,8 +521,8 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, __u32 oplock; int created = FILE_CREATED; - cFYI(1, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p", - inode, direntry->d_name.name, direntry); + cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n", + inode, direntry->d_name.name, direntry); tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); rc = PTR_ERR(tlink); @@ -613,7 +616,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, goto mknod_out; - cFYI(1, "sfu compat create special file"); + cifs_dbg(FYI, "sfu compat create special file\n"); buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (buf == NULL) { @@ -688,8 +691,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, xid = get_xid(); - cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p", - 
parent_dir_inode, direntry->d_name.name, direntry); + cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n", + parent_dir_inode, direntry->d_name.name, direntry); /* check whether path exists */ @@ -715,11 +718,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } if (direntry->d_inode != NULL) { - cFYI(1, "non-NULL inode in lookup"); + cifs_dbg(FYI, "non-NULL inode in lookup\n"); } else { - cFYI(1, "NULL inode in lookup"); + cifs_dbg(FYI, "NULL inode in lookup\n"); } - cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode); + cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", + full_path, direntry->d_inode); if (pTcon->unix_ext) { rc = cifs_get_inode_info_unix(&newInode, full_path, @@ -742,7 +746,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, /* if it was once a directory (but how can we tell?) we could do shrink_dcache_parent(direntry); */ } else if (rc != -EACCES) { - cERROR(1, "Unexpected lookup error %d", rc); + cifs_dbg(VFS, "Unexpected lookup error %d\n", rc); /* We special case check for Access Denied - since that is a common return code */ } @@ -807,7 +811,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) { int rc = 0; - cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name); + cifs_dbg(FYI, "In cifs d_delete, name = %s\n", direntry->d_name.name); return rc; } */ diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 1d2d91d9bf65..e7512e497611 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -55,7 +55,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) len = strlen(unc); if (len < 3) { - cFYI(1, "%s: unc is too short: %s", __func__, unc); + cifs_dbg(FYI, "%s: unc is too short: %s\n", __func__, unc); return -EINVAL; } @@ -68,8 +68,8 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) if (sep) len = sep - hostname; else - cFYI(1, "%s: probably server name is whole unc: %s", - __func__, unc); + cifs_dbg(FYI, 
"%s: probably server name is whole unc: %s\n", + __func__, unc); /* Try to interpret hostname as an IPv4 or IPv6 address */ rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len); @@ -79,11 +79,11 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) /* Perform the upcall */ rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); if (rc < 0) - cFYI(1, "%s: unable to resolve: %*.*s", - __func__, len, len, hostname); + cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n", + __func__, len, len, hostname); else - cFYI(1, "%s: resolved: %*.*s to %s", - __func__, len, len, hostname, *ip_addr); + cifs_dbg(FYI, "%s: resolved: %*.*s to %s\n", + __func__, len, len, hostname, *ip_addr); return rc; name_is_IP_address: @@ -92,7 +92,8 @@ name_is_IP_address: return -ENOMEM; memcpy(name, hostname, len); name[len] = 0; - cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name); + cifs_dbg(FYI, "%s: unc is IP, skipping dns upcall: %s\n", + __func__, name); *ip_addr = name; return 0; } diff --git a/fs/cifs/export.c b/fs/cifs/export.c index 9c7ecdccf2f3..ce8b7f677c58 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -49,7 +49,7 @@ static struct dentry *cifs_get_parent(struct dentry *dentry) { /* BB need to add code here eventually to enable export via NFSD */ - cFYI(1, "get parent for %p", dentry); + cifs_dbg(FYI, "get parent for %p\n", dentry); return ERR_PTR(-EACCES); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8c0d85577314..48b29d24c9f4 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -78,9 +78,8 @@ static u32 cifs_posix_convert_flags(unsigned int flags) if (flags & O_EXCL) posix_flags |= SMB_O_EXCL; } else if (flags & O_EXCL) - cFYI(1, "Application %s pid %d has incorrectly set O_EXCL flag" - "but not O_CREAT on file open. Ignoring O_EXCL", - current->comm, current->tgid); + cifs_dbg(FYI, "Application %s pid %d has incorrectly set O_EXCL flag but not O_CREAT on file open. 
Ignoring O_EXCL\n", + current->comm, current->tgid); if (flags & O_TRUNC) posix_flags |= SMB_O_TRUNC; @@ -123,7 +122,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, struct tcon_link *tlink; struct cifs_tcon *tcon; - cFYI(1, "posix open %s", full_path); + cifs_dbg(FYI, "posix open %s\n", full_path); presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); if (presp_data == NULL) @@ -300,13 +299,15 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + cifs_sb_active(inode->i_sb); + /* * If the server returned a read oplock and we have mandatory brlocks, * set oplock level to None. */ if (oplock == server->vals->oplock_read && cifs_has_mand_locks(cinode)) { - cFYI(1, "Reset oplock val from read to None due to mand locks"); + cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n"); oplock = 0; } @@ -349,7 +350,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); struct TCP_Server_Info *server = tcon->ses->server; struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifsLockInfo *li, *tmp; struct cifs_fid fid; struct cifs_pending_open open; @@ -371,8 +373,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) list_del(&cifs_file->tlist); if (list_empty(&cifsi->openFileList)) { - cFYI(1, "closing last open instance for inode %p", - cifs_file->dentry->d_inode); + cifs_dbg(FYI, "closing last open instance for inode %p\n", + cifs_file->dentry->d_inode); /* * In strict cache mode we need invalidate mapping on the last * close because it may cause a error when we open this file @@ -414,6 +416,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); + cifs_sb_deactive(sb); 
kfree(cifs_file); } @@ -450,7 +453,7 @@ int cifs_open(struct inode *inode, struct file *file) goto out; } - cFYI(1, "inode = 0x%p file flags are 0x%x for %s", + cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n", inode, file->f_flags, full_path); if (server->oplocks) @@ -466,16 +469,13 @@ int cifs_open(struct inode *inode, struct file *file) cifs_sb->mnt_file_mode /* ignored */, file->f_flags, &oplock, &fid.netfid, xid); if (rc == 0) { - cFYI(1, "posix open succeeded"); + cifs_dbg(FYI, "posix open succeeded\n"); posix_open_ok = true; } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { if (tcon->ses->serverNOS) - cERROR(1, "server %s of type %s returned" - " unexpected error on SMB posix open" - ", disabling posix open support." - " Check if server update available.", - tcon->ses->serverName, - tcon->ses->serverNOS); + cifs_dbg(VFS, "server %s of type %s returned unexpected error on SMB posix open, disabling posix open support. Check if server update available.\n", + tcon->ses->serverName, + tcon->ses->serverNOS); tcon->broken_posix_open = true; } else if ((rc != -EIO) && (rc != -EREMOTE) && (rc != -EOPNOTSUPP)) /* path not found or net err */ @@ -617,8 +617,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) return rc; } - cFYI(1, "inode = 0x%p file flags 0x%x for %s", inode, cfile->f_flags, - full_path); + cifs_dbg(FYI, "inode = 0x%p file flags 0x%x for %s\n", + inode, cfile->f_flags, full_path); if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; @@ -639,7 +639,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) cifs_sb->mnt_file_mode /* ignored */, oflags, &oplock, &fid.netfid, xid); if (rc == 0) { - cFYI(1, "posix reopen succeeded"); + cifs_dbg(FYI, "posix reopen succeeded\n"); goto reopen_success; } /* @@ -668,8 +668,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) NULL, cifs_sb); if (rc) { mutex_unlock(&cfile->fh_mutex); - cFYI(1, "cifs_reopen returned 0x%x", rc); - cFYI(1, "oplock: %d", oplock); + 
cifs_dbg(FYI, "cifs_reopen returned 0x%x\n", rc); + cifs_dbg(FYI, "oplock: %d\n", oplock); goto reopen_error_exit; } @@ -725,7 +725,7 @@ int cifs_closedir(struct inode *inode, struct file *file) struct TCP_Server_Info *server; char *buf; - cFYI(1, "Closedir inode = 0x%p", inode); + cifs_dbg(FYI, "Closedir inode = 0x%p\n", inode); if (cfile == NULL) return rc; @@ -734,7 +734,7 @@ int cifs_closedir(struct inode *inode, struct file *file) tcon = tlink_tcon(cfile->tlink); server = tcon->ses->server; - cFYI(1, "Freeing private data in close dir"); + cifs_dbg(FYI, "Freeing private data in close dir\n"); spin_lock(&cifs_file_list_lock); if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { cfile->invalidHandle = true; @@ -743,7 +743,7 @@ int cifs_closedir(struct inode *inode, struct file *file) rc = server->ops->close_dir(xid, tcon, &cfile->fid); else rc = -ENOSYS; - cFYI(1, "Closing uncompleted readdir with rc %d", rc); + cifs_dbg(FYI, "Closing uncompleted readdir with rc %d\n", rc); /* not much we can do if it fails anyway, ignore rc */ rc = 0; } else @@ -751,7 +751,7 @@ int cifs_closedir(struct inode *inode, struct file *file) buf = cfile->srch_inf.ntwrk_buf_start; if (buf) { - cFYI(1, "closedir free smb buf in srch struct"); + cifs_dbg(FYI, "closedir free smb buf in srch struct\n"); cfile->srch_inf.ntwrk_buf_start = NULL; if (cfile->srch_inf.smallBuf) cifs_small_buf_release(buf); @@ -1136,7 +1136,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) * The list ended. We don't have enough allocated * structures - something is really wrong. 
*/ - cERROR(1, "Can't push all brlocks!"); + cifs_dbg(VFS, "Can't push all brlocks!\n"); break; } length = 1 + flock->fl_end - flock->fl_start; @@ -1209,47 +1209,46 @@ cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock, bool *wait_flag, struct TCP_Server_Info *server) { if (flock->fl_flags & FL_POSIX) - cFYI(1, "Posix"); + cifs_dbg(FYI, "Posix\n"); if (flock->fl_flags & FL_FLOCK) - cFYI(1, "Flock"); + cifs_dbg(FYI, "Flock\n"); if (flock->fl_flags & FL_SLEEP) { - cFYI(1, "Blocking lock"); + cifs_dbg(FYI, "Blocking lock\n"); *wait_flag = true; } if (flock->fl_flags & FL_ACCESS) - cFYI(1, "Process suspended by mandatory locking - " - "not implemented yet"); + cifs_dbg(FYI, "Process suspended by mandatory locking - not implemented yet\n"); if (flock->fl_flags & FL_LEASE) - cFYI(1, "Lease on file - not implemented yet"); + cifs_dbg(FYI, "Lease on file - not implemented yet\n"); if (flock->fl_flags & (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE | FL_CLOSE))) - cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags); + cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags); *type = server->vals->large_lock_type; if (flock->fl_type == F_WRLCK) { - cFYI(1, "F_WRLCK "); + cifs_dbg(FYI, "F_WRLCK\n"); *type |= server->vals->exclusive_lock_type; *lock = 1; } else if (flock->fl_type == F_UNLCK) { - cFYI(1, "F_UNLCK"); + cifs_dbg(FYI, "F_UNLCK\n"); *type |= server->vals->unlock_lock_type; *unlock = 1; /* Check if unlock includes more than one lock range */ } else if (flock->fl_type == F_RDLCK) { - cFYI(1, "F_RDLCK"); + cifs_dbg(FYI, "F_RDLCK\n"); *type |= server->vals->shared_lock_type; *lock = 1; } else if (flock->fl_type == F_EXLCK) { - cFYI(1, "F_EXLCK"); + cifs_dbg(FYI, "F_EXLCK\n"); *type |= server->vals->exclusive_lock_type; *lock = 1; } else if (flock->fl_type == F_SHLCK) { - cFYI(1, "F_SHLCK"); + cifs_dbg(FYI, "F_SHLCK\n"); *type |= server->vals->shared_lock_type; *lock = 1; } else - cFYI(1, "Unknown type of lock"); + cifs_dbg(FYI, 
"Unknown type of lock\n"); } static int @@ -1292,8 +1291,8 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, type, 0, 1, false); flock->fl_type = F_UNLCK; if (rc != 0) - cERROR(1, "Error unlocking previously locked " - "range %d during test of lock", rc); + cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n", + rc); return 0; } @@ -1312,8 +1311,8 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, type | server->vals->shared_lock_type, 0, 1, false); flock->fl_type = F_RDLCK; if (rc != 0) - cERROR(1, "Error unlocking previously locked " - "range %d during test of lock", rc); + cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n", + rc); } else flock->fl_type = F_WRLCK; @@ -1504,8 +1503,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, if (!CIFS_I(inode)->clientCanCacheAll && CIFS_I(inode)->clientCanCacheRead) { cifs_invalidate_mapping(inode); - cFYI(1, "Set no oplock for inode=%p due to mand locks", - inode); + cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n", + inode); CIFS_I(inode)->clientCanCacheRead = false; } @@ -1542,9 +1541,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) rc = -EACCES; xid = get_xid(); - cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld " - "end: %lld", cmd, flock->fl_flags, flock->fl_type, - flock->fl_start, flock->fl_end); + cifs_dbg(FYI, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld end: %lld\n", + cmd, flock->fl_flags, flock->fl_type, + flock->fl_start, flock->fl_end); cfile = (struct cifsFileInfo *)file->private_data; tcon = tlink_tcon(cfile->tlink); @@ -1616,8 +1615,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, cifs_sb = CIFS_SB(dentry->d_sb); - cFYI(1, "write %zd bytes to offset %lld of %s", write_size, - *offset, dentry->d_name.name); + cifs_dbg(FYI, "write %zd bytes to offset %lld of %s\n", + write_size, 
*offset, dentry->d_name.name); tcon = tlink_tcon(open_file->tlink); server = tcon->ses->server; @@ -1732,7 +1731,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, it being zero) during stress testcases so we need to check for it */ if (cifs_inode == NULL) { - cERROR(1, "Null inode passed to cifs_writeable_file"); + cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n"); dump_stack(); return NULL; } @@ -1844,7 +1843,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) else if (bytes_written < 0) rc = bytes_written; } else { - cFYI(1, "No writeable filehandles for inode"); + cifs_dbg(FYI, "No writeable filehandles for inode\n"); rc = -EIO; } @@ -2011,7 +2010,7 @@ retry: wdata->cfile = find_writable_file(CIFS_I(mapping->host), false); if (!wdata->cfile) { - cERROR(1, "No writable handles for inode"); + cifs_dbg(VFS, "No writable handles for inode\n"); rc = -EBADF; break; } @@ -2072,7 +2071,7 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc) /* BB add check for wbc flags */ page_cache_get(page); if (!PageUptodate(page)) - cFYI(1, "ppw - page not up to date"); + cifs_dbg(FYI, "ppw - page not up to date\n"); /* * Set the "writeback" flag, and clear "dirty" in the radix tree. 
@@ -2123,7 +2122,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, else pid = current->tgid; - cFYI(1, "write_end for page %p from pos %lld with %d bytes", + cifs_dbg(FYI, "write_end for page %p from pos %lld with %d bytes\n", page, pos, copied); if (PageChecked(page)) { @@ -2187,13 +2186,13 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, xid = get_xid(); - cFYI(1, "Sync file - name: %s datasync: 0x%x", - file->f_path.dentry->d_name.name, datasync); + cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n", + file->f_path.dentry->d_name.name, datasync); if (!CIFS_I(inode)->clientCanCacheRead) { rc = cifs_invalidate_mapping(inode); if (rc) { - cFYI(1, "rc: %d during invalidate phase", rc); + cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc); rc = 0; /* don't care about it in fsync */ } } @@ -2229,8 +2228,8 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) xid = get_xid(); - cFYI(1, "Sync file - name: %s datasync: 0x%x", - file->f_path.dentry->d_name.name, datasync); + cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n", + file->f_path.dentry->d_name.name, datasync); tcon = tlink_tcon(smbfile->tlink); if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { @@ -2258,7 +2257,7 @@ int cifs_flush(struct file *file, fl_owner_t id) if (file->f_mode & FMODE_WRITE) rc = filemap_write_and_wait(inode->i_mapping); - cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc); + cifs_dbg(FYI, "Flush inode %p file %p rc %d\n", inode, file, rc); return rc; } @@ -2516,8 +2515,6 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); - sb_start_write(inode->i_sb); - /* * We need to hold the sem to be sure nobody modifies lock list * with a brlock that prevents writing. 
@@ -2541,7 +2538,6 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, } up_read(&cinode->lock_sem); - sb_end_write(inode->i_sb); return rc; } @@ -2578,8 +2574,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, * an old data. */ cifs_invalidate_mapping(inode); - cFYI(1, "Set no oplock for inode=%p after a write operation", - inode); + cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n", + inode); cinode->clientCanCacheRead = false; } return written; @@ -2755,15 +2751,15 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, /* enough data to fill the page */ iov.iov_base = kmap(page); iov.iov_len = PAGE_SIZE; - cFYI(1, "%u: iov_base=%p iov_len=%zu", - i, iov.iov_base, iov.iov_len); + cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n", + i, iov.iov_base, iov.iov_len); len -= PAGE_SIZE; } else if (len > 0) { /* enough for partial page, fill and zero the rest */ iov.iov_base = kmap(page); iov.iov_len = len; - cFYI(1, "%u: iov_base=%p iov_len=%zu", - i, iov.iov_base, iov.iov_len); + cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n", + i, iov.iov_base, iov.iov_len); memset(iov.iov_base + len, '\0', PAGE_SIZE - len); rdata->tailsz = len; len = 0; @@ -2823,7 +2819,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, pid = current->tgid; if ((file->f_flags & O_ACCMODE) == O_WRONLY) - cFYI(1, "attempting read on write only file instance"); + cifs_dbg(FYI, "attempting read on write only file instance\n"); do { cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); @@ -3002,7 +2998,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) pid = current->tgid; if ((file->f_flags & O_ACCMODE) == O_WRONLY) - cFYI(1, "attempting read on write only file instance"); + cifs_dbg(FYI, "attempting read on write only file instance\n"); for (total_read = 0, cur_offset = read_data; read_size > total_read; total_read += bytes_read, cur_offset += bytes_read) { @@ -3093,7 +3089,8 @@ int 
cifs_file_mmap(struct file *file, struct vm_area_struct *vma) xid = get_xid(); rc = cifs_revalidate_file(file); if (rc) { - cFYI(1, "Validation prior to mmap failed, error=%d", rc); + cifs_dbg(FYI, "Validation prior to mmap failed, error=%d\n", + rc); free_xid(xid); return rc; } @@ -3146,7 +3143,7 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server, /* determine the eof that the server (probably) has */ eof = CIFS_I(rdata->mapping->host)->server_eof; eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; - cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index); + cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index); rdata->tailsz = PAGE_CACHE_SIZE; for (i = 0; i < nr_pages; i++) { @@ -3156,15 +3153,15 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server, /* enough data to fill the page */ iov.iov_base = kmap(page); iov.iov_len = PAGE_CACHE_SIZE; - cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", - i, page->index, iov.iov_base, iov.iov_len); + cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n", + i, page->index, iov.iov_base, iov.iov_len); len -= PAGE_CACHE_SIZE; } else if (len > 0) { /* enough for partial page, fill and zero the rest */ iov.iov_base = kmap(page); iov.iov_len = len; - cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu", - i, page->index, iov.iov_base, iov.iov_len); + cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n", + i, page->index, iov.iov_base, iov.iov_len); memset(iov.iov_base + len, '\0', PAGE_CACHE_SIZE - len); rdata->tailsz = len; @@ -3244,8 +3241,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rc = 0; INIT_LIST_HEAD(&tmplist); - cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file, - mapping, num_pages); + cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", + __func__, file, mapping, num_pages); /* * Start with the page at end of list and move it to private @@ -3375,7 +3372,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page, if (rc < 0) goto io_error; 
else - cFYI(1, "Bytes read %d", rc); + cifs_dbg(FYI, "Bytes read %d\n", rc); file_inode(file)->i_atime = current_fs_time(file_inode(file)->i_sb); @@ -3413,7 +3410,7 @@ static int cifs_readpage(struct file *file, struct page *page) return rc; } - cFYI(1, "readpage %p at offset %d 0x%x", + cifs_dbg(FYI, "readpage %p at offset %d 0x%x\n", page, (int)offset, (int)offset); rc = cifs_readpage_worker(file, page, &offset); @@ -3480,7 +3477,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, struct page *page; int rc = 0; - cFYI(1, "write_begin from %lld len %d", (long long)pos, len); + cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len); page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { @@ -3569,7 +3566,7 @@ static int cifs_launder_page(struct page *page) .range_end = range_end, }; - cFYI(1, "Launder page: %p", page); + cifs_dbg(FYI, "Launder page: %p\n", page); if (clear_page_dirty_for_io(page)) rc = cifs_writepage_locked(page, &wbc); @@ -3589,8 +3586,8 @@ void cifs_oplock_break(struct work_struct *work) if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead && cifs_has_mand_locks(cinode)) { - cFYI(1, "Reset oplock to None for inode=%p due to mand locks", - inode); + cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", + inode); cinode->clientCanCacheRead = false; } @@ -3605,12 +3602,12 @@ void cifs_oplock_break(struct work_struct *work) mapping_set_error(inode->i_mapping, rc); cifs_invalidate_mapping(inode); } - cFYI(1, "Oplock flush inode %p rc %d", inode, rc); + cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc); } rc = cifs_push_locks(cfile); if (rc) - cERROR(1, "Push locks rc = %d", rc); + cifs_dbg(VFS, "Push locks rc = %d\n", rc); /* * releasing stale oplock after recent reconnect of smb session using @@ -3621,7 +3618,7 @@ void cifs_oplock_break(struct work_struct *work) if (!cfile->oplock_break_cancelled) { rc = tcon->ses->server->ops->oplock_response(tcon, 
&cfile->fid, cinode); - cFYI(1, "Oplock release rc = %d", rc); + cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } } diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 42e5363b4102..2f4bc5a58054 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -28,14 +28,14 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) server->fscache = fscache_acquire_cookie(cifs_fscache_netfs.primary_index, &cifs_fscache_server_index_def, server); - cFYI(1, "%s: (0x%p/0x%p)", __func__, server, - server->fscache); + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", + __func__, server, server->fscache); } void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) { - cFYI(1, "%s: (0x%p/0x%p)", __func__, server, - server->fscache); + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", + __func__, server, server->fscache); fscache_relinquish_cookie(server->fscache, 0); server->fscache = NULL; } @@ -47,13 +47,13 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) tcon->fscache = fscache_acquire_cookie(server->fscache, &cifs_fscache_super_index_def, tcon); - cFYI(1, "%s: (0x%p/0x%p)", __func__, server->fscache, - tcon->fscache); + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", + __func__, server->fscache, tcon->fscache); } void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) { - cFYI(1, "%s: (0x%p)", __func__, tcon->fscache); + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache); fscache_relinquish_cookie(tcon->fscache, 0); tcon->fscache = NULL; } @@ -70,8 +70,8 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { cifsi->fscache = fscache_acquire_cookie(tcon->fscache, &cifs_fscache_inode_object_def, cifsi); - cFYI(1, "%s: got FH cookie (0x%p/0x%p)", __func__, - tcon->fscache, cifsi->fscache); + cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n", + __func__, tcon->fscache, cifsi->fscache); } } @@ -80,7 +80,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode) struct cifsInodeInfo *cifsi = 
CIFS_I(inode); if (cifsi->fscache) { - cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); fscache_relinquish_cookie(cifsi->fscache, 0); cifsi->fscache = NULL; } @@ -91,7 +91,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { - cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); fscache_uncache_all_inode_pages(cifsi->fscache, inode); fscache_relinquish_cookie(cifsi->fscache, 1); cifsi->fscache = NULL; @@ -120,8 +120,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode) cifs_sb_master_tcon(cifs_sb)->fscache, &cifs_fscache_inode_object_def, cifsi); - cFYI(1, "%s: new cookie 0x%p oldcookie 0x%p", - __func__, cifsi->fscache, old); + cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n", + __func__, cifsi->fscache, old); } } @@ -131,8 +131,8 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp) struct inode *inode = page->mapping->host; struct cifsInodeInfo *cifsi = CIFS_I(inode); - cFYI(1, "%s: (0x%p/0x%p)", __func__, page, - cifsi->fscache); + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", + __func__, page, cifsi->fscache); if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) return 0; } @@ -143,7 +143,7 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp) static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx, int error) { - cFYI(1, "%s: (0x%p/%d)", __func__, page, error); + cifs_dbg(FYI, "%s: (0x%p/%d)\n", __func__, page, error); if (!error) SetPageUptodate(page); unlock_page(page); @@ -156,8 +156,8 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) { int ret; - cFYI(1, "%s: (fsc:%p, p:%p, i:0x%p", __func__, - CIFS_I(inode)->fscache, page, inode); + cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n", + __func__, CIFS_I(inode)->fscache, page, inode); ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, 
page, cifs_readpage_from_fscache_complete, NULL, @@ -165,15 +165,15 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) switch (ret) { case 0: /* page found in fscache, read submitted */ - cFYI(1, "%s: submitted", __func__); + cifs_dbg(FYI, "%s: submitted\n", __func__); return ret; case -ENOBUFS: /* page won't be cached */ case -ENODATA: /* page not in cache */ - cFYI(1, "%s: %d", __func__, ret); + cifs_dbg(FYI, "%s: %d\n", __func__, ret); return 1; default: - cERROR(1, "unknown error ret = %d", ret); + cifs_dbg(VFS, "unknown error ret = %d\n", ret); } return ret; } @@ -188,8 +188,8 @@ int __cifs_readpages_from_fscache(struct inode *inode, { int ret; - cFYI(1, "%s: (0x%p/%u/0x%p)", __func__, - CIFS_I(inode)->fscache, *nr_pages, inode); + cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n", + __func__, CIFS_I(inode)->fscache, *nr_pages, inode); ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, pages, nr_pages, cifs_readpage_from_fscache_complete, @@ -197,16 +197,16 @@ int __cifs_readpages_from_fscache(struct inode *inode, mapping_gfp_mask(mapping)); switch (ret) { case 0: /* read submitted to the cache for all pages */ - cFYI(1, "%s: submitted", __func__); + cifs_dbg(FYI, "%s: submitted\n", __func__); return ret; case -ENOBUFS: /* some pages are not cached and can't be */ case -ENODATA: /* some pages are not cached */ - cFYI(1, "%s: no page", __func__); + cifs_dbg(FYI, "%s: no page\n", __func__); return 1; default: - cFYI(1, "unknown error ret = %d", ret); + cifs_dbg(FYI, "unknown error ret = %d\n", ret); } return ret; @@ -216,8 +216,8 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) { int ret; - cFYI(1, "%s: (fsc: %p, p: %p, i: %p)", __func__, - CIFS_I(inode)->fscache, page, inode); + cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", + __func__, CIFS_I(inode)->fscache, page, inode); ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL); if (ret != 0) fscache_uncache_page(CIFS_I(inode)->fscache, page); @@ 
-228,7 +228,7 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); struct fscache_cookie *cookie = cifsi->fscache; - cFYI(1, "%s: (0x%p/0x%p)", __func__, page, cookie); + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); fscache_wait_on_page_write(cookie, page); fscache_uncache_page(cookie, page); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 83f2606c76d0..fc3025199cb3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -91,30 +91,32 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); - cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid); + cifs_dbg(FYI, "%s: revalidating inode %llu\n", + __func__, cifs_i->uniqueid); if (inode->i_state & I_NEW) { - cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid); + cifs_dbg(FYI, "%s: inode %llu is new\n", + __func__, cifs_i->uniqueid); return; } /* don't bother with revalidation if we have an oplock */ if (cifs_i->clientCanCacheRead) { - cFYI(1, "%s: inode %llu is oplocked", __func__, - cifs_i->uniqueid); + cifs_dbg(FYI, "%s: inode %llu is oplocked\n", + __func__, cifs_i->uniqueid); return; } /* revalidate if mtime or size have changed */ if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) && cifs_i->server_eof == fattr->cf_eof) { - cFYI(1, "%s: inode %llu is unchanged", __func__, - cifs_i->uniqueid); + cifs_dbg(FYI, "%s: inode %llu is unchanged\n", + __func__, cifs_i->uniqueid); return; } - cFYI(1, "%s: invalidating inode %llu mapping", __func__, - cifs_i->uniqueid); + cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n", + __func__, cifs_i->uniqueid); cifs_i->invalid_mapping = true; } @@ -240,7 +242,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, /* safest to call it a file if we do not know */ fattr->cf_mode |= S_IFREG; fattr->cf_dtype = DT_REG; - cFYI(1, "unknown type %d", le32_to_cpu(info->Type)); + 
cifs_dbg(FYI, "unknown type %d\n", le32_to_cpu(info->Type)); break; } @@ -279,7 +281,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - cFYI(1, "creating fake fattr for DFS referral"); + cifs_dbg(FYI, "creating fake fattr for DFS referral\n"); memset(fattr, 0, sizeof(*fattr)); fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; @@ -329,7 +331,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, struct tcon_link *tlink; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - cFYI(1, "Getting info on %s", full_path); + cifs_dbg(FYI, "Getting info on %s\n", full_path); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -355,7 +357,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); if (tmprc) - cFYI(1, "CIFSCheckMFSymlink: %d", tmprc); + cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); } if (*pinode == NULL) { @@ -422,7 +424,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, &buf_type); if ((rc == 0) && (bytes_read >= 8)) { if (memcmp("IntxBLK", pbuf, 8) == 0) { - cFYI(1, "Block device"); + cifs_dbg(FYI, "Block device\n"); fattr->cf_mode |= S_IFBLK; fattr->cf_dtype = DT_BLK; if (bytes_read == 24) { @@ -434,7 +436,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, fattr->cf_rdev = MKDEV(mjr, mnr); } } else if (memcmp("IntxCHR", pbuf, 8) == 0) { - cFYI(1, "Char device"); + cifs_dbg(FYI, "Char device\n"); fattr->cf_mode |= S_IFCHR; fattr->cf_dtype = DT_CHR; if (bytes_read == 24) { @@ -446,7 +448,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path, fattr->cf_rdev = MKDEV(mjr, mnr); } } else if (memcmp("IntxLNK", pbuf, 7) == 0) { - cFYI(1, "Symlink"); + cifs_dbg(FYI, "Symlink\n"); fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; } else { @@ -497,10 +499,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const 
unsigned char *path, else if (rc > 3) { mode = le32_to_cpu(*((__le32 *)ea_value)); fattr->cf_mode &= ~SFBITS_MASK; - cFYI(1, "special bits 0%o org mode 0%o", mode, - fattr->cf_mode); + cifs_dbg(FYI, "special bits 0%o org mode 0%o\n", + mode, fattr->cf_mode); fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode; - cFYI(1, "special mode bits 0%o", mode); + cifs_dbg(FYI, "special mode bits 0%o\n", mode); } return 0; @@ -635,11 +637,11 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, tcon = tlink_tcon(tlink); server = tcon->ses->server; - cFYI(1, "Getting info on %s", full_path); + cifs_dbg(FYI, "Getting info on %s\n", full_path); if ((data == NULL) && (*inode != NULL)) { if (CIFS_I(*inode)->clientCanCacheRead) { - cFYI(1, "No need to revalidate cached inode sizes"); + cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); goto cgii_exit; } } @@ -714,7 +716,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, tcon, cifs_sb, full_path, &fattr.cf_uniqueid, data); if (tmprc) { - cFYI(1, "GetSrvInodeNum rc %d", tmprc); + cifs_dbg(FYI, "GetSrvInodeNum rc %d\n", + tmprc); fattr.cf_uniqueid = iunique(sb, ROOT_I); cifs_autodisable_serverino(cifs_sb); } @@ -729,7 +732,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid); if (tmprc) - cFYI(1, "cifs_sfu_type failed: %d", tmprc); + cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc); } #ifdef CONFIG_CIFS_ACL @@ -737,8 +740,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, full_path, fid); if (rc) { - cFYI(1, "%s: Getting ACL failed with error: %d", - __func__, rc); + cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n", + __func__, rc); goto cgii_exit; } } @@ -752,7 +755,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, if 
(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); if (tmprc) - cFYI(1, "CIFSCheckMFSymlink: %d", tmprc); + cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); } if (!*inode) { @@ -836,7 +839,7 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr) struct inode *inode; retry_iget5_locked: - cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid); + cifs_dbg(FYI, "looking for uniqueid=%llu\n", fattr->cf_uniqueid); /* hash down to 32-bits on 32-bit arch */ hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); @@ -899,7 +902,7 @@ struct inode *cifs_root_iget(struct super_block *sb) #endif if (rc && tcon->ipc) { - cFYI(1, "ipc connection - fake read inode"); + cifs_dbg(FYI, "ipc connection - fake read inode\n"); spin_lock(&inode->i_lock); inode->i_mode |= S_IFDIR; set_nlink(inode, 2); @@ -958,7 +961,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, * server times. */ if (set_time && (attrs->ia_valid & ATTR_CTIME)) { - cFYI(1, "CIFS - CTIME changed"); + cifs_dbg(FYI, "CIFS - CTIME changed\n"); info_buf.ChangeTime = cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); } else @@ -995,6 +998,15 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + /* + * We cannot rename the file if the server doesn't support + * CAP_INFOLEVEL_PASSTHRU + */ + if (!(tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)) { + rc = -EBUSY; + goto out; + } + rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, &netfid, &oplock, NULL, cifs_sb->local_nls, @@ -1023,7 +1035,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, current->tgid); /* although we would like to mark the file hidden if that fails we will still try to rename it */ - if (rc != 0) + if (!rc) cifsInode->cifsAttrs = dosattr; else dosattr = origattr; /* since not able to change them */ @@ -1034,7 
+1046,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc != 0) { - rc = -ETXTBSY; + rc = -EBUSY; goto undo_setattr; } @@ -1053,7 +1065,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, if (rc == -ENOENT) rc = 0; else if (rc != 0) { - rc = -ETXTBSY; + rc = -EBUSY; goto undo_rename; } cifsInode->delete_pending = true; @@ -1118,7 +1130,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) struct iattr *attrs = NULL; __u32 dosattr = 0, origattr = 0; - cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry); + cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1141,7 +1153,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) rc = CIFSPOSIXDelFile(xid, tcon, full_path, SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - cFYI(1, "posix del rc %d", rc); + cifs_dbg(FYI, "posix del rc %d\n", rc); if ((rc == 0) || (rc == -ENOENT)) goto psx_del_no_retry; } @@ -1160,15 +1172,13 @@ psx_del_no_retry: cifs_drop_nlink(inode); } else if (rc == -ENOENT) { d_drop(dentry); - } else if (rc == -ETXTBSY) { + } else if (rc == -EBUSY) { if (server->ops->rename_pending_delete) { rc = server->ops->rename_pending_delete(full_path, dentry, xid); if (rc == 0) cifs_drop_nlink(inode); } - if (rc == -ETXTBSY) - rc = -EBUSY; } else if ((rc == -EACCES) && (dosattr == 0) && inode) { attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (attrs == NULL) { @@ -1313,7 +1323,7 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode, if (rc == -EOPNOTSUPP) goto posix_mkdir_out; else if (rc) { - cFYI(1, "posix mkdir returned 0x%x", rc); + cifs_dbg(FYI, "posix mkdir returned 0x%x\n", rc); d_drop(dentry); goto posix_mkdir_out; } @@ -1335,11 +1345,12 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode, 
d_instantiate(dentry, newinode); #ifdef CONFIG_CIFS_DEBUG2 - cFYI(1, "instantiated dentry %p %s to inode %p", dentry, - dentry->d_name.name, newinode); + cifs_dbg(FYI, "instantiated dentry %p %s to inode %p\n", + dentry, dentry->d_name.name, newinode); if (newinode->i_nlink != 2) - cFYI(1, "unexpected number of links %d", newinode->i_nlink); + cifs_dbg(FYI, "unexpected number of links %d\n", + newinode->i_nlink); #endif posix_mkdir_out: @@ -1361,7 +1372,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) struct TCP_Server_Info *server; char *full_path; - cFYI(1, "In cifs_mkdir, mode = 0x%hx inode = 0x%p", mode, inode); + cifs_dbg(FYI, "In cifs_mkdir, mode = 0x%hx inode = 0x%p\n", + mode, inode); cifs_sb = CIFS_SB(inode->i_sb); tlink = cifs_sb_tlink(cifs_sb); @@ -1395,7 +1407,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) /* BB add setting the equivalent of mode via CreateX w/ACLs */ rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb); if (rc) { - cFYI(1, "cifs_mkdir returned 0x%x", rc); + cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc); d_drop(direntry); goto mkdir_out; } @@ -1425,7 +1437,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) char *full_path = NULL; struct cifsInodeInfo *cifsInode; - cFYI(1, "cifs_rmdir, inode = 0x%p", inode); + cifs_dbg(FYI, "cifs_rmdir, inode = 0x%p\n", inode); xid = get_xid(); @@ -1509,7 +1521,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, * source. Note that cross directory moves do not work with * rename by filehandle to various Windows servers. 
*/ - if (rc == 0 || rc != -ETXTBSY) + if (rc == 0 || rc != -EBUSY) goto do_rename_exit; /* open-file renames don't work across directories */ @@ -1674,8 +1686,8 @@ cifs_invalidate_mapping(struct inode *inode) if (inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = invalidate_inode_pages2(inode->i_mapping); if (rc) { - cERROR(1, "%s: could not invalidate inode %p", __func__, - inode); + cifs_dbg(VFS, "%s: could not invalidate inode %p\n", + __func__, inode); cifs_i->invalid_mapping = true; } } @@ -1725,8 +1737,8 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) goto out; } - cFYI(1, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time " - "%ld jiffies %ld", full_path, inode, inode->i_count.counter, + cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", + full_path, inode, inode->i_count.counter, dentry, dentry->d_time, jiffies); if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) @@ -1876,7 +1888,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, else rc = -ENOSYS; cifsFileInfo_put(open_file); - cFYI(1, "SetFSize for attrs rc = %d", rc); + cifs_dbg(FYI, "SetFSize for attrs rc = %d\n", rc); if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { unsigned int bytes_written; @@ -1887,7 +1899,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, io_parms.length = attrs->ia_size; rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL, NULL, 1); - cFYI(1, "Wrt seteof rc %d", rc); + cifs_dbg(FYI, "Wrt seteof rc %d\n", rc); } } else rc = -EINVAL; @@ -1913,7 +1925,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, attrs->ia_size, cifs_sb, false); else rc = -ENOSYS; - cFYI(1, "SetEOF by path (setattrs) rc = %d", rc); + cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc); if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { __u16 netfid; int oplock = 0; @@ -1933,7 +1945,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, io_parms.length = attrs->ia_size; rc = 
CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL, NULL, 1); - cFYI(1, "wrt seteof rc %d", rc); + cifs_dbg(FYI, "wrt seteof rc %d\n", rc); CIFSSMBClose(xid, tcon, netfid); } } @@ -1964,7 +1976,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) struct cifs_unix_set_info_args *args = NULL; struct cifsFileInfo *open_file; - cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x", + cifs_dbg(FYI, "setattr_unix on file %s attrs->ia_valid=0x%x\n", direntry->d_name.name, attrs->ia_valid); xid = get_xid(); @@ -2107,7 +2119,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) xid = get_xid(); - cFYI(1, "setattr on file %s attrs->iavalid 0x%x", + cifs_dbg(FYI, "setattr on file %s attrs->iavalid 0x%x\n", direntry->d_name.name, attrs->ia_valid); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) @@ -2159,8 +2171,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64, uid, gid); if (rc) { - cFYI(1, "%s: Setting id failed with error: %d", - __func__, rc); + cifs_dbg(FYI, "%s: Setting id failed with error: %d\n", + __func__, rc); goto cifs_setattr_exit; } } @@ -2181,8 +2193,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) rc = id_mode_to_cifs_acl(inode, full_path, mode, INVALID_UID, INVALID_GID); if (rc) { - cFYI(1, "%s: Setting ACL failed with error: %d", - __func__, rc); + cifs_dbg(FYI, "%s: Setting ACL failed with error: %d\n", + __func__, rc); goto cifs_setattr_exit; } } else @@ -2270,7 +2282,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs) #if 0 void cifs_delete_inode(struct inode *inode) { - cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode); + cifs_dbg(FYI, "In cifs_delete_inode, inode = 0x%p\n", inode); /* may have to add back in if and when safe distributed caching of directories added e.g. 
via FindNotify */ } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 6c9f1214cf0b..3e0845585853 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -44,7 +44,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) xid = get_xid(); - cFYI(1, "ioctl file %p cmd %u arg %lu", filep, command, arg); + cifs_dbg(FYI, "ioctl file %p cmd %u arg %lu\n", filep, command, arg); cifs_sb = CIFS_SB(inode->i_sb); @@ -83,11 +83,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) * &ExtAttrMask); */ } - cFYI(1, "set flags not implemented yet"); + cifs_dbg(FYI, "set flags not implemented yet\n"); break; #endif /* CONFIG_CIFS_POSIX */ default: - cFYI(1, "unsupported ioctl"); + cifs_dbg(FYI, "unsupported ioctl\n"); break; } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 9f6c4c45d21e..b83c3f5646bd 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -56,14 +56,14 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) md5 = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(md5)) { rc = PTR_ERR(md5); - cERROR(1, "%s: Crypto md5 allocation error %d", __func__, rc); + cifs_dbg(VFS, "%s: Crypto md5 allocation error %d\n", + __func__, rc); return rc; } size = sizeof(struct shash_desc) + crypto_shash_descsize(md5); sdescmd5 = kmalloc(size, GFP_KERNEL); if (!sdescmd5) { rc = -ENOMEM; - cERROR(1, "%s: Memory allocation failure", __func__); goto symlink_hash_err; } sdescmd5->shash.tfm = md5; @@ -71,17 +71,17 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) rc = crypto_shash_init(&sdescmd5->shash); if (rc) { - cERROR(1, "%s: Could not init md5 shash", __func__); + cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__); goto symlink_hash_err; } rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); if (rc) { - cERROR(1, "%s: Could not update with link_str", __func__); + cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__); goto symlink_hash_err; } rc = 
crypto_shash_final(&sdescmd5->shash, md5_hash); if (rc) - cERROR(1, "%s: Could not generate md5 hash", __func__); + cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); symlink_hash_err: crypto_free_shash(md5); @@ -115,7 +115,7 @@ CIFSParseMFSymlink(const u8 *buf, rc = symlink_hash(link_len, link_str, md5_hash); if (rc) { - cFYI(1, "%s: MD5 hash failure: %d", __func__, rc); + cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); return rc; } @@ -154,7 +154,7 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) rc = symlink_hash(link_len, link_str, md5_hash); if (rc) { - cFYI(1, "%s: MD5 hash failure: %d", __func__, rc); + cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); return rc; } @@ -521,7 +521,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) if (!full_path) goto out; - cFYI(1, "Full path: %s inode = 0x%p", full_path, inode); + cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode); rc = -EACCES; /* @@ -578,8 +578,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) goto symlink_exit; } - cFYI(1, "Full path: %s", full_path); - cFYI(1, "symname is %s", symname); + cifs_dbg(FYI, "Full path: %s\n", full_path); + cifs_dbg(FYI, "symname is %s\n", symname); /* BB what if DFS and this volume is on different share? 
BB */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) @@ -601,8 +601,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) inode->i_sb, xid, NULL); if (rc != 0) { - cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d", - rc); + cifs_dbg(FYI, "Create symlink ok, getinodeinfo fail rc = %d\n", + rc); } else { d_instantiate(direntry, newinode); } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1b15bf839f37..1bec014779fd 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -54,7 +54,7 @@ _get_xid(void) if (GlobalTotalActiveXid > GlobalMaxActiveXid) GlobalMaxActiveXid = GlobalTotalActiveXid; if (GlobalTotalActiveXid > 65000) - cFYI(1, "warning: more than 65000 requests active"); + cifs_dbg(FYI, "warning: more than 65000 requests active\n"); xid = GlobalCurrentXid++; spin_unlock(&GlobalMid_Lock); return xid; @@ -91,7 +91,7 @@ void sesInfoFree(struct cifs_ses *buf_to_free) { if (buf_to_free == NULL) { - cFYI(1, "Null buffer passed to sesInfoFree"); + cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n"); return; } @@ -130,7 +130,7 @@ void tconInfoFree(struct cifs_tcon *buf_to_free) { if (buf_to_free == NULL) { - cFYI(1, "Null buffer passed to tconInfoFree"); + cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n"); return; } atomic_dec(&tconInfoAllocCount); @@ -180,7 +180,7 @@ void cifs_buf_release(void *buf_to_free) { if (buf_to_free == NULL) { - /* cFYI(1, "Null buffer passed to cifs_buf_release");*/ + /* cifs_dbg(FYI, "Null buffer passed to cifs_buf_release\n");*/ return; } mempool_free(buf_to_free, cifs_req_poolp); @@ -216,7 +216,7 @@ cifs_small_buf_release(void *buf_to_free) { if (buf_to_free == NULL) { - cFYI(1, "Null buffer passed to cifs_small_buf_release"); + cifs_dbg(FYI, "Null buffer passed to cifs_small_buf_release\n"); return; } mempool_free(buf_to_free, cifs_sm_req_poolp); @@ -282,15 +282,15 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) { /* does it have the right SMB "signature" ? 
*/ if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) { - cERROR(1, "Bad protocol string signature header 0x%x", - *(unsigned int *)smb->Protocol); + cifs_dbg(VFS, "Bad protocol string signature header 0x%x\n", + *(unsigned int *)smb->Protocol); return 1; } /* Make sure that message ids match */ if (mid != smb->Mid) { - cERROR(1, "Mids do not match. received=%u expected=%u", - smb->Mid, mid); + cifs_dbg(VFS, "Mids do not match. received=%u expected=%u\n", + smb->Mid, mid); return 1; } @@ -302,7 +302,7 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) if (smb->Command == SMB_COM_LOCKING_ANDX) return 0; - cERROR(1, "Server sent request, not response. mid=%u", smb->Mid); + cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", smb->Mid); return 1; } @@ -313,8 +313,8 @@ checkSMB(char *buf, unsigned int total_read) __u16 mid = smb->Mid; __u32 rfclen = be32_to_cpu(smb->smb_buf_length); __u32 clc_len; /* calculated length */ - cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", - total_read, rfclen); + cifs_dbg(FYI, "checkSMB Length: 0x%x, smb_buf_length: 0x%x\n", + total_read, rfclen); /* is this frame too small to even get to a BCC? 
*/ if (total_read < 2 + sizeof(struct smb_hdr)) { @@ -340,9 +340,9 @@ checkSMB(char *buf, unsigned int total_read) tmp[sizeof(struct smb_hdr)+1] = 0; return 0; } - cERROR(1, "rcvd invalid byte count (bcc)"); + cifs_dbg(VFS, "rcvd invalid byte count (bcc)\n"); } else { - cERROR(1, "Length less than smb header size"); + cifs_dbg(VFS, "Length less than smb header size\n"); } return -EIO; } @@ -353,8 +353,8 @@ checkSMB(char *buf, unsigned int total_read) clc_len = smbCalcSize(smb); if (4 + rfclen != total_read) { - cERROR(1, "Length read does not match RFC1001 length %d", - rfclen); + cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n", + rfclen); return -EIO; } @@ -365,12 +365,12 @@ checkSMB(char *buf, unsigned int total_read) if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF)) return 0; /* bcc wrapped */ } - cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u", - clc_len, 4 + rfclen, smb->Mid); + cifs_dbg(FYI, "Calculated size %u vs length %u mismatch for mid=%u\n", + clc_len, 4 + rfclen, smb->Mid); if (4 + rfclen < clc_len) { - cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u", - rfclen, smb->Mid); + cifs_dbg(VFS, "RFC1001 size %u smaller than SMB for mid=%u\n", + rfclen, smb->Mid); return -EIO; } else if (rfclen > clc_len + 512) { /* @@ -382,8 +382,8 @@ checkSMB(char *buf, unsigned int total_read) * trailing data, we choose limit the amount of extra * data to 512 bytes. 
*/ - cERROR(1, "RFC1001 size %u more than 512 bytes larger " - "than SMB for mid=%u", rfclen, smb->Mid); + cifs_dbg(VFS, "RFC1001 size %u more than 512 bytes larger than SMB for mid=%u\n", + rfclen, smb->Mid); return -EIO; } } @@ -401,7 +401,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) struct cifsInodeInfo *pCifsInode; struct cifsFileInfo *netfile; - cFYI(1, "Checking for oplock break or dnotify response"); + cifs_dbg(FYI, "Checking for oplock break or dnotify response\n"); if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && (pSMB->hdr.Flags & SMBFLG_RESPONSE)) { struct smb_com_transaction_change_notify_rsp *pSMBr = @@ -413,15 +413,15 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) pnotify = (struct file_notify_information *) ((char *)&pSMBr->hdr.Protocol + data_offset); - cFYI(1, "dnotify on %s Action: 0x%x", + cifs_dbg(FYI, "dnotify on %s Action: 0x%x\n", pnotify->FileName, pnotify->Action); /* cifs_dump_mem("Rcvd notify Data: ",buf, sizeof(struct smb_hdr)+60); */ return true; } if (pSMBr->hdr.Status.CifsError) { - cFYI(1, "notify err 0x%d", - pSMBr->hdr.Status.CifsError); + cifs_dbg(FYI, "notify err 0x%d\n", + pSMBr->hdr.Status.CifsError); return true; } return false; @@ -435,7 +435,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) large dirty files cached on the client */ if ((NT_STATUS_INVALID_HANDLE) == le32_to_cpu(pSMB->hdr.Status.CifsError)) { - cFYI(1, "invalid handle on oplock break"); + cifs_dbg(FYI, "invalid handle on oplock break\n"); return true; } else if (ERRbadfid == le16_to_cpu(pSMB->hdr.Status.DosError.Error)) { @@ -447,7 +447,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) if (pSMB->hdr.WordCount != 8) return false; - cFYI(1, "oplock type 0x%d level 0x%d", + cifs_dbg(FYI, "oplock type 0x%d level 0x%d\n", pSMB->LockType, pSMB->OplockLevel); if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) return false; @@ -469,7 +469,7 @@ is_valid_oplock_break(char *buffer, 
struct TCP_Server_Info *srv) if (pSMB->Fid != netfile->fid.netfid) continue; - cFYI(1, "file id match, oplock break"); + cifs_dbg(FYI, "file id match, oplock break\n"); pCifsInode = CIFS_I(netfile->dentry->d_inode); cifs_set_oplock_level(pCifsInode, @@ -484,12 +484,12 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) } spin_unlock(&cifs_file_list_lock); spin_unlock(&cifs_tcp_ses_lock); - cFYI(1, "No matching file for oplock break"); + cifs_dbg(FYI, "No matching file for oplock break\n"); return true; } } spin_unlock(&cifs_tcp_ses_lock); - cFYI(1, "Can not process oplock break for non-existent connection"); + cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n"); return true; } @@ -536,12 +536,8 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; - cERROR(1, "Autodisabling the use of server inode numbers on " - "%s. This server doesn't seem to support them " - "properly. Hardlinks will not be recognized on this " - "mount. Consider mounting with the \"noserverino\" " - "option to silence this message.", - cifs_sb_master_tcon(cifs_sb)->treeName); + cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s. This server doesn't seem to support them properly. Hardlinks will not be recognized on this mount. 
Consider mounting with the \"noserverino\" option to silence this message.\n", + cifs_sb_master_tcon(cifs_sb)->treeName); } } @@ -552,13 +548,13 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) if (oplock == OPLOCK_EXCLUSIVE) { cinode->clientCanCacheAll = true; cinode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", - &cinode->vfs_inode); + cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", + &cinode->vfs_inode); } else if (oplock == OPLOCK_READ) { cinode->clientCanCacheAll = false; cinode->clientCanCacheRead = true; - cFYI(1, "Level II Oplock granted on inode %p", - &cinode->vfs_inode); + cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", + &cinode->vfs_inode); } else { cinode->clientCanCacheAll = false; cinode->clientCanCacheRead = false; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index a82bc51fdc82..af847e1cf1c1 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -62,7 +62,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ERRdiffdevice, -EXDEV}, {ERRnofiles, -ENOENT}, {ERRwriteprot, -EROFS}, - {ERRbadshare, -ETXTBSY}, + {ERRbadshare, -EBUSY}, {ERRlock, -EACCES}, {ERRunsup, -EINVAL}, {ERRnosuchshare, -ENXIO}, @@ -150,8 +150,8 @@ cifs_inet_pton(const int address_family, const char *cp, int len, void *dst) else if (address_family == AF_INET6) ret = in6_pton(cp, len, dst , '\\', NULL); - cFYI(DBG2, "address conversion returned %d for %*.*s", - ret, len, len, cp); + cifs_dbg(NOISY, "address conversion returned %d for %*.*s\n", + ret, len, len, cp); if (ret > 0) ret = 1; return ret; @@ -887,7 +887,7 @@ map_smb_to_linux_error(char *buf, bool logErr) } /* else ERRHRD class errors or junk - return EIO */ - cFYI(1, "Mapping smb error code 0x%x to POSIX err %d", + cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n", le32_to_cpu(smb->Status.CifsError), rc); /* generic corrective action e.g. 
reconnect SMB session on @@ -951,20 +951,20 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) SMB_TIME *st = (SMB_TIME *)&time; SMB_DATE *sd = (SMB_DATE *)&date; - cFYI(1, "date %d time %d", date, time); + cifs_dbg(FYI, "date %d time %d\n", date, time); sec = 2 * st->TwoSeconds; min = st->Minutes; if ((sec > 59) || (min > 59)) - cERROR(1, "illegal time min %d sec %d", min, sec); + cifs_dbg(VFS, "illegal time min %d sec %d\n", min, sec); sec += (min * 60); sec += 60 * 60 * st->Hours; if (st->Hours > 24) - cERROR(1, "illegal hours %d", st->Hours); + cifs_dbg(VFS, "illegal hours %d\n", st->Hours); days = sd->Day; month = sd->Month; if ((days > 31) || (month > 12)) { - cERROR(1, "illegal date, month %d day: %d", month, days); + cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days); if (month > 12) month = 12; } @@ -990,7 +990,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) ts.tv_sec = sec + offset; - /* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */ + /* cifs_dbg(FYI, "sec after cnvrt dos to unix time %d\n",sec); */ ts.tv_nsec = 0; return ts; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index df40cc5fd13a..770d5a9781c1 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -48,15 +48,15 @@ static void dump_cifs_file_struct(struct file *file, char *label) if (file) { cf = file->private_data; if (cf == NULL) { - cFYI(1, "empty cifs private file data"); + cifs_dbg(FYI, "empty cifs private file data\n"); return; } if (cf->invalidHandle) - cFYI(1, "invalid handle"); + cifs_dbg(FYI, "invalid handle\n"); if (cf->srch_inf.endOfSearch) - cFYI(1, "end of search"); + cifs_dbg(FYI, "end of search\n"); if (cf->srch_inf.emptyDir) - cFYI(1, "empty dir"); + cifs_dbg(FYI, "empty dir\n"); } } #else @@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, struct super_block *sb = parent->d_inode->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - cFYI(1, "%s: for %s", __func__, 
name->name); + cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); dentry = d_hash_and_lookup(parent, name); if (unlikely(IS_ERR(dentry))) @@ -233,7 +233,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, fid, cifs_sb->local_nls); if (CIFSSMBClose(xid, ptcon, fid)) { - cFYI(1, "Error closing temporary reparsepoint open"); + cifs_dbg(FYI, "Error closing temporary reparsepoint open\n"); } } } @@ -285,7 +285,7 @@ initiate_cifs_search(const unsigned int xid, struct file *file) goto error_exit; } - cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos); + cifs_dbg(FYI, "Full path: %s start at: %lld\n", full_path, file->f_pos); ffirst_retry: /* test for Unix extensions */ @@ -336,7 +336,7 @@ static int cifs_unicode_bytelen(const char *str) if (ustr[len] == 0) return len << 1; } - cFYI(1, "Unicode string longer than PATH_MAX found"); + cifs_dbg(FYI, "Unicode string longer than PATH_MAX found\n"); return len << 1; } @@ -353,18 +353,18 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) pfData->FileNameLength; } else new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); - cFYI(1, "new entry %p old entry %p", new_entry, old_entry); + cifs_dbg(FYI, "new entry %p old entry %p\n", new_entry, old_entry); /* validate that new_entry is not past end of SMB */ if (new_entry >= end_of_smb) { - cERROR(1, "search entry %p began after end of SMB %p old entry %p", - new_entry, end_of_smb, old_entry); + cifs_dbg(VFS, "search entry %p began after end of SMB %p old entry %p\n", + new_entry, end_of_smb, old_entry); return NULL; } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) || ((level != SMB_FIND_FILE_INFO_STANDARD) && (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { - cERROR(1, "search entry %p extends after end of SMB %p", - new_entry, end_of_smb); + cifs_dbg(VFS, "search entry %p extends after end of SMB %p\n", + new_entry, end_of_smb); 
return NULL; } else return new_entry; @@ -457,7 +457,7 @@ static int cifs_fill_dirent(struct cifs_dirent *de, const void *info, cifs_fill_dirent_std(de, info); break; default: - cFYI(1, "Unknown findfirst level %d", level); + cifs_dbg(FYI, "Unknown findfirst level %d\n", level); return -EINVAL; } @@ -572,7 +572,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, if (((index_to_find < cfile->srch_inf.index_of_last_entry) && is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) { /* close and restart search */ - cFYI(1, "search backing up - close and restart search"); + cifs_dbg(FYI, "search backing up - close and restart search\n"); spin_lock(&cifs_file_list_lock); if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { cfile->invalidHandle = true; @@ -582,7 +582,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, } else spin_unlock(&cifs_file_list_lock); if (cfile->srch_inf.ntwrk_buf_start) { - cFYI(1, "freeing SMB ff cache buf on search rewind"); + cifs_dbg(FYI, "freeing SMB ff cache buf on search rewind\n"); if (cfile->srch_inf.smallBuf) cifs_small_buf_release(cfile->srch_inf. 
ntwrk_buf_start); @@ -593,7 +593,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, } rc = initiate_cifs_search(xid, file); if (rc) { - cFYI(1, "error %d reinitiating a search on rewind", + cifs_dbg(FYI, "error %d reinitiating a search on rewind\n", rc); return rc; } @@ -608,7 +608,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, while ((index_to_find >= cfile->srch_inf.index_of_last_entry) && (rc == 0) && !cfile->srch_inf.endOfSearch) { - cFYI(1, "calling findnext2"); + cifs_dbg(FYI, "calling findnext2\n"); rc = server->ops->query_dir_next(xid, tcon, &cfile->fid, search_flags, &cfile->srch_inf); @@ -631,7 +631,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, first_entry_in_buffer = cfile->srch_inf.index_of_last_entry - cfile->srch_inf.entries_in_buffer; pos_in_buf = index_to_find - first_entry_in_buffer; - cFYI(1, "found entry - pos_in_buf %d", pos_in_buf); + cifs_dbg(FYI, "found entry - pos_in_buf %d\n", pos_in_buf); for (i = 0; (i < (pos_in_buf)) && (cur_ent != NULL); i++) { /* go entry by entry figuring out which is first */ @@ -640,19 +640,18 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, } if ((cur_ent == NULL) && (i < pos_in_buf)) { /* BB fixme - check if we should flag this error */ - cERROR(1, "reached end of buf searching for pos in buf" - " %d index to find %lld rc %d", pos_in_buf, - index_to_find, rc); + cifs_dbg(VFS, "reached end of buf searching for pos in buf %d index to find %lld rc %d\n", + pos_in_buf, index_to_find, rc); } rc = 0; *current_entry = cur_ent; } else { - cFYI(1, "index not in buffer - could not findnext into it"); + cifs_dbg(FYI, "index not in buffer - could not findnext into it\n"); return 0; } if (pos_in_buf >= cfile->srch_inf.entries_in_buffer) { - cFYI(1, "can not return entries pos_in_buf beyond last"); + cifs_dbg(FYI, "can not return entries pos_in_buf beyond last\n"); *num_to_ret = 0; } else *num_to_ret = cfile->srch_inf.entries_in_buffer - 
pos_in_buf; @@ -678,8 +677,8 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, return rc; if (de.namelen > max_len) { - cERROR(1, "bad search response length %zd past smb end", - de.namelen); + cifs_dbg(VFS, "bad search response length %zd past smb end\n", + de.namelen); return -EINVAL; } @@ -768,7 +767,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) */ if (file->private_data == NULL) { rc = initiate_cifs_search(xid, file); - cFYI(1, "initiate cifs search rc %d", rc); + cifs_dbg(FYI, "initiate cifs search rc %d\n", rc); if (rc) goto rddir2_exit; } @@ -777,7 +776,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) case 0: if (filldir(direntry, ".", 1, file->f_pos, file_inode(file)->i_ino, DT_DIR) < 0) { - cERROR(1, "Filldir for current dir failed"); + cifs_dbg(VFS, "Filldir for current dir failed\n"); rc = -ENOMEM; break; } @@ -785,7 +784,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) case 1: if (filldir(direntry, "..", 2, file->f_pos, parent_ino(file->f_path.dentry), DT_DIR) < 0) { - cERROR(1, "Filldir for parent dir failed"); + cifs_dbg(VFS, "Filldir for parent dir failed\n"); rc = -ENOMEM; break; } @@ -804,7 +803,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) cifsFile = file->private_data; if (cifsFile->srch_inf.endOfSearch) { if (cifsFile->srch_inf.emptyDir) { - cFYI(1, "End of search, empty dir"); + cifs_dbg(FYI, "End of search, empty dir\n"); rc = 0; break; } @@ -817,16 +816,16 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) rc = find_cifs_entry(xid, tcon, file, &current_entry, &num_to_fill); if (rc) { - cFYI(1, "fce error %d", rc); + cifs_dbg(FYI, "fce error %d\n", rc); goto rddir2_exit; } else if (current_entry != NULL) { - cFYI(1, "entry %lld found", file->f_pos); + cifs_dbg(FYI, "entry %lld found\n", file->f_pos); } else { - cFYI(1, "could not find entry"); + cifs_dbg(FYI, "could not find
entry\n"); goto rddir2_exit; } - cFYI(1, "loop through %d times filling dir for net buf %p", - num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); + cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n", + num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); max_len = tcon->ses->server->ops->calc_smb_size( cifsFile->srch_inf.ntwrk_buf_start); end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; @@ -840,8 +839,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) for (i = 0; (i < num_to_fill) && (rc == 0); i++) { if (current_entry == NULL) { /* evaluate whether this case is an error */ - cERROR(1, "past SMB end, num to fill %d i %d", - num_to_fill, i); + cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n", + num_to_fill, i); break; } /* @@ -858,8 +857,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) file->f_pos++; if (file->f_pos == cifsFile->srch_inf.index_of_last_entry) { - cFYI(1, "last entry in buf at pos %lld %s", - file->f_pos, tmp_buf); + cifs_dbg(FYI, "last entry in buf at pos %lld %s\n", + file->f_pos, tmp_buf); cifs_save_resume_key(current_entry, cifsFile); break; } else diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 76809f4d3428..f230571a7ab3 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -283,11 +283,11 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses, int len; char *data = *pbcc_area; - cFYI(1, "bleft %d", bleft); + cifs_dbg(FYI, "bleft %d\n", bleft); kfree(ses->serverOS); ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp); - cFYI(1, "serverOS=%s", ses->serverOS); + cifs_dbg(FYI, "serverOS=%s\n", ses->serverOS); len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; data += len; bleft -= len; @@ -296,7 +296,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses, kfree(ses->serverNOS); ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp); - cFYI(1, "serverNOS=%s", ses->serverNOS); + cifs_dbg(FYI, 
"serverNOS=%s\n", ses->serverNOS); len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2; data += len; bleft -= len; @@ -305,7 +305,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses, kfree(ses->serverDomain); ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp); - cFYI(1, "serverDomain=%s", ses->serverDomain); + cifs_dbg(FYI, "serverDomain=%s\n", ses->serverDomain); return; } @@ -318,7 +318,7 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, int len; char *bcc_ptr = *pbcc_area; - cFYI(1, "decode sessetup ascii. bleft %d", bleft); + cifs_dbg(FYI, "decode sessetup ascii. bleft %d\n", bleft); len = strnlen(bcc_ptr, bleft); if (len >= bleft) @@ -330,7 +330,7 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, if (ses->serverOS) strncpy(ses->serverOS, bcc_ptr, len); if (strncmp(ses->serverOS, "OS/2", 4) == 0) { - cFYI(1, "OS/2 server"); + cifs_dbg(FYI, "OS/2 server\n"); ses->flags |= CIFS_SES_OS2; } @@ -359,7 +359,7 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, /* BB For newer servers which do not support Unicode, but thus do return domain here we could add parsing for it later, but it is not very important */ - cFYI(1, "ascii: bytes left %d", bleft); + cifs_dbg(FYI, "ascii: bytes left %d\n", bleft); return rc; } @@ -373,16 +373,18 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; if (blob_len < sizeof(CHALLENGE_MESSAGE)) { - cERROR(1, "challenge blob len %d too small", blob_len); + cifs_dbg(VFS, "challenge blob len %d too small\n", blob_len); return -EINVAL; } if (memcmp(pblob->Signature, "NTLMSSP", 8)) { - cERROR(1, "blob signature incorrect %s", pblob->Signature); + cifs_dbg(VFS, "blob signature incorrect %s\n", + pblob->Signature); return -EINVAL; } if (pblob->MessageType != NtLmChallenge) { - cERROR(1, "Incorrect message type %d", pblob->MessageType); + cifs_dbg(VFS, "Incorrect message type %d\n", + 
pblob->MessageType); return -EINVAL; } @@ -395,16 +397,17 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset); tilen = le16_to_cpu(pblob->TargetInfoArray.Length); if (tioffset > blob_len || tioffset + tilen > blob_len) { - cERROR(1, "tioffset + tilen too high %u + %u", tioffset, tilen); + cifs_dbg(VFS, "tioffset + tilen too high %u + %u", + tioffset, tilen); return -EINVAL; } if (tilen) { - ses->auth_key.response = kmalloc(tilen, GFP_KERNEL); + ses->auth_key.response = kmemdup(bcc_ptr + tioffset, tilen, + GFP_KERNEL); if (!ses->auth_key.response) { - cERROR(1, "Challenge target info allocation failure"); + cifs_dbg(VFS, "Challenge target info alloc failure"); return -ENOMEM; } - memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen); ses->auth_key.len = tilen; } @@ -486,7 +489,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); rc = setup_ntlmv2_rsp(ses, nls_cp); if (rc) { - cERROR(1, "Error %d during NTLMSSP authentication", rc); + cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc); goto setup_ntlmv2_ret; } memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE, @@ -580,7 +583,7 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, return -EINVAL; type = ses->server->secType; - cFYI(1, "sess setup type %d", type); + cifs_dbg(FYI, "sess setup type %d\n", type); if (type == RawNTLMSSP) { /* if memory allocation is successful, caller of this function * frees it. @@ -674,7 +677,7 @@ ssetup_ntlmssp_authenticate: changed to do higher than lanman dialect and we reconnected would we ever calc signing_key? 
*/ - cFYI(1, "Negotiating LANMAN setting up strings"); + cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); /* Unicode not allowed for LANMAN dialects */ ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); #endif @@ -688,7 +691,8 @@ ssetup_ntlmssp_authenticate: /* calculate ntlm response and session key */ rc = setup_ntlm_response(ses, nls_cp); if (rc) { - cERROR(1, "Error %d during NTLM authentication", rc); + cifs_dbg(VFS, "Error %d during NTLM authentication\n", + rc); goto ssetup_exit; } @@ -718,7 +722,8 @@ ssetup_ntlmssp_authenticate: /* calculate nlmv2 response and session key */ rc = setup_ntlmv2_rsp(ses, nls_cp); if (rc) { - cERROR(1, "Error %d during NTLMv2 authentication", rc); + cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", + rc); goto ssetup_exit; } memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, @@ -754,21 +759,21 @@ ssetup_ntlmssp_authenticate: /* check version field to make sure that cifs.upcall is sending us a response in an expected form */ if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { - cERROR(1, "incorrect version of cifs.upcall (expected" - " %d but got %d)", + cifs_dbg(VFS, "incorrect version of cifs.upcall " + "expected %d but got %d)", CIFS_SPNEGO_UPCALL_VERSION, msg->version); rc = -EKEYREJECTED; goto ssetup_exit; } - ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL); + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); if (!ses->auth_key.response) { - cERROR(1, "Kerberos can't allocate (%u bytes) memory", + cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory", msg->sesskey_len); rc = -ENOMEM; goto ssetup_exit; } - memcpy(ses->auth_key.response, msg->data, msg->sesskey_len); ses->auth_key.len = msg->sesskey_len; pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; @@ -790,18 +795,18 @@ ssetup_ntlmssp_authenticate: /* BB: is this right? */ ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); #else /* ! 
CONFIG_CIFS_UPCALL */ - cERROR(1, "Kerberos negotiated but upcall support disabled!"); + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); rc = -ENOSYS; goto ssetup_exit; #endif /* CONFIG_CIFS_UPCALL */ } else if (type == RawNTLMSSP) { if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { - cERROR(1, "NTLMSSP requires Unicode support"); + cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); rc = -ENOSYS; goto ssetup_exit; } - cFYI(1, "ntlmssp session setup phase %d", phase); + cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase); pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; capabilities |= CAP_EXTENDED_SECURITY; pSMB->req.Capabilities |= cpu_to_le32(capabilities); @@ -824,7 +829,6 @@ ssetup_ntlmssp_authenticate: 5*sizeof(struct _AUTHENTICATE_MESSAGE), GFP_KERNEL); if (!ntlmsspblob) { - cERROR(1, "Can't allocate NTLMSSP blob"); rc = -ENOMEM; goto ssetup_exit; } @@ -844,7 +848,7 @@ ssetup_ntlmssp_authenticate: smb_buf->Uid = ses->Suid; break; default: - cERROR(1, "invalid phase %d", phase); + cifs_dbg(VFS, "invalid phase %d\n", phase); rc = -ENOSYS; goto ssetup_exit; } @@ -855,7 +859,7 @@ ssetup_ntlmssp_authenticate: } unicode_oslm_strings(&bcc_ptr, nls_cp); } else { - cERROR(1, "secType %d not supported!", type); + cifs_dbg(VFS, "secType %d not supported!\n", type); rc = -ENOSYS; goto ssetup_exit; } @@ -880,7 +884,7 @@ ssetup_ntlmssp_authenticate: (smb_buf->Status.CifsError == cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { if (phase != NtLmNegotiate) { - cERROR(1, "Unexpected more processing error"); + cifs_dbg(VFS, "Unexpected more processing error\n"); goto ssetup_exit; } /* NTLMSSP Negotiate sent now processing challenge (response) */ @@ -892,14 +896,14 @@ ssetup_ntlmssp_authenticate: if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { rc = -EIO; - cERROR(1, "bad word count %d", smb_buf->WordCount); + cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); goto ssetup_exit; } action = le16_to_cpu(pSMB->resp.Action); if (action & 
GUEST_LOGIN) - cFYI(1, "Guest login"); /* BB mark SesInfo struct? */ + cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ - cFYI(1, "UID = %llu ", ses->Suid); + cifs_dbg(FYI, "UID = %llu\n", ses->Suid); /* response can have either 3 or 4 word count - Samba sends 3 */ /* and lanman response is 3 */ bytes_remaining = get_bcc(smb_buf); @@ -908,7 +912,8 @@ ssetup_ntlmssp_authenticate: if (smb_buf->WordCount == 4) { blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); if (blob_len > bytes_remaining) { - cERROR(1, "bad security blob length %d", blob_len); + cifs_dbg(VFS, "bad security blob length %d\n", + blob_len); rc = -EINVAL; goto ssetup_exit; } @@ -946,7 +951,7 @@ ssetup_exit: kfree(ntlmsspblob); ntlmsspblob = NULL; if (resp_buf_type == CIFS_SMALL_BUFFER) { - cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base); + cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base); cifs_small_buf_release(iov[0].iov_base); } else if (resp_buf_type == CIFS_LARGE_BUFFER) cifs_buf_release(iov[0].iov_base); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 47bc5a87f94e..3efdb9d5c0b8 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -61,10 +61,13 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf, */ --server->sequence_number; rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); + if (rc < 0) + server->sequence_number--; + mutex_unlock(&server->srv_mutex); - cFYI(1, "issued NT_CANCEL for mid %u, rc = %d", - in_buf->Mid, rc); + cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n", + in_buf->Mid, rc); return rc; } @@ -249,7 +252,7 @@ check2ndT2(char *buf) /* check for plausible wct, bcc and t2 data and parm sizes */ /* check for parm and data offset going beyond end of smb */ if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */ - cFYI(1, "invalid transact2 word count"); + cifs_dbg(FYI, "invalid transact2 word count\n"); return -EINVAL; } @@ -261,18 
+264,18 @@ check2ndT2(char *buf) if (total_data_size == data_in_this_rsp) return 0; else if (total_data_size < data_in_this_rsp) { - cFYI(1, "total data %d smaller than data in frame %d", - total_data_size, data_in_this_rsp); + cifs_dbg(FYI, "total data %d smaller than data in frame %d\n", + total_data_size, data_in_this_rsp); return -EINVAL; } remaining = total_data_size - data_in_this_rsp; - cFYI(1, "missing %d bytes from transact2, check next response", - remaining); + cifs_dbg(FYI, "missing %d bytes from transact2, check next response\n", + remaining); if (total_data_size > CIFSMaxBufSize) { - cERROR(1, "TotalDataSize %d is over maximum buffer %d", - total_data_size, CIFSMaxBufSize); + cifs_dbg(VFS, "TotalDataSize %d is over maximum buffer %d\n", + total_data_size, CIFSMaxBufSize); return -EINVAL; } return remaining; @@ -293,28 +296,28 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); if (tgt_total_cnt != src_total_cnt) - cFYI(1, "total data count of primary and secondary t2 differ " - "source=%hu target=%hu", src_total_cnt, tgt_total_cnt); + cifs_dbg(FYI, "total data count of primary and secondary t2 differ source=%hu target=%hu\n", + src_total_cnt, tgt_total_cnt); total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); remaining = tgt_total_cnt - total_in_tgt; if (remaining < 0) { - cFYI(1, "Server sent too much data. tgt_total_cnt=%hu " - "total_in_tgt=%hu", tgt_total_cnt, total_in_tgt); + cifs_dbg(FYI, "Server sent too much data. 
tgt_total_cnt=%hu total_in_tgt=%hu\n", + tgt_total_cnt, total_in_tgt); return -EPROTO; } if (remaining == 0) { /* nothing to do, ignore */ - cFYI(1, "no more data remains"); + cifs_dbg(FYI, "no more data remains\n"); return 0; } total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount); if (remaining < total_in_src) - cFYI(1, "transact2 2nd response contains too much data"); + cifs_dbg(FYI, "transact2 2nd response contains too much data\n"); /* find end of first SMB data area */ data_area_of_tgt = (char *)&pSMBt->hdr.Protocol + @@ -329,7 +332,8 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) total_in_tgt += total_in_src; /* is the result too big for the field? */ if (total_in_tgt > USHRT_MAX) { - cFYI(1, "coalesced DataCount too large (%u)", total_in_tgt); + cifs_dbg(FYI, "coalesced DataCount too large (%u)\n", + total_in_tgt); return -EPROTO; } put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount); @@ -339,7 +343,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) byte_count += total_in_src; /* is the result too big for the field? 
*/ if (byte_count > USHRT_MAX) { - cFYI(1, "coalesced BCC too large (%u)", byte_count); + cifs_dbg(FYI, "coalesced BCC too large (%u)\n", byte_count); return -EPROTO; } put_bcc(byte_count, target_hdr); @@ -348,7 +352,8 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) byte_count += total_in_src; /* don't allow buffer to overflow */ if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cFYI(1, "coalesced BCC exceeds buffer size (%u)", byte_count); + cifs_dbg(FYI, "coalesced BCC exceeds buffer size (%u)\n", + byte_count); return -ENOBUFS; } target_hdr->smb_buf_length = cpu_to_be32(byte_count); @@ -358,12 +363,12 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) if (remaining != total_in_src) { /* more responses to go */ - cFYI(1, "waiting for more secondary responses"); + cifs_dbg(FYI, "waiting for more secondary responses\n"); return 1; } /* we are done */ - cFYI(1, "found the last secondary response"); + cifs_dbg(FYI, "found the last secondary response\n"); return 0; } @@ -388,7 +393,7 @@ cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server, } if (!server->large_buf) { /*FIXME: switch to already allocated largebuf?*/ - cERROR(1, "1st trans2 resp needs bigbuf"); + cifs_dbg(VFS, "1st trans2 resp needs bigbuf\n"); } else { /* Have first buffer */ mid->resp_buf = buf; @@ -776,8 +781,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, goto out; } - cFYI(1, "calling SetFileInfo since SetPathInfo for times not supported " - "by this server"); + cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, &netfid, &oplock, NULL, cifs_sb->local_nls, diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 71e6aed4b382..5da1b55a2258 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -43,13 +43,13 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) 
if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) { cinode->clientCanCacheAll = true; cinode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", - &cinode->vfs_inode); + cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", + &cinode->vfs_inode); } else if (oplock == SMB2_OPLOCK_LEVEL_II) { cinode->clientCanCacheAll = false; cinode->clientCanCacheRead = true; - cFYI(1, "Level II Oplock granted on inode %p", - &cinode->vfs_inode); + cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", + &cinode->vfs_inode); } else { cinode->clientCanCacheAll = false; cinode->clientCanCacheRead = false; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 706482452df4..fff6dfba6204 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -92,7 +92,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, (FILE_BASIC_INFO *)data); break; default: - cERROR(1, "Invalid command"); + cifs_dbg(VFS, "Invalid command\n"); break; } diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 494c912c76fe..7c2f45c06fc2 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -2472,7 +2472,7 @@ map_smb2_to_linux_error(char *buf, bool log_err) /* on error mapping not found - return EIO */ - cFYI(1, "Mapping SMB2 status code %d to POSIX err %d", + cifs_dbg(FYI, "Mapping SMB2 status code %d to POSIX err %d\n", smb2err, rc); return rc; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 7b1c5e3287fb..10383d8c015b 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -45,17 +45,17 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) if (hdr->Command == SMB2_OPLOCK_BREAK) return 0; else - cERROR(1, "Received Request not response"); + cifs_dbg(VFS, "Received Request not response\n"); } } else { /* bad signature or mid */ if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER) - cERROR(1, "Bad protocol string signature header %x", - *(unsigned int *) hdr->ProtocolId); + cifs_dbg(VFS, "Bad protocol string signature header %x\n", 
+ *(unsigned int *) hdr->ProtocolId); if (mid != hdr->MessageId) - cERROR(1, "Mids do not match: %llu and %llu", mid, - hdr->MessageId); + cifs_dbg(VFS, "Mids do not match: %llu and %llu\n", + mid, hdr->MessageId); } - cERROR(1, "Bad SMB detected. The Mid=%llu", hdr->MessageId); + cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", hdr->MessageId); return 1; } @@ -101,7 +101,8 @@ smb2_check_message(char *buf, unsigned int length) int command; /* BB disable following printk later */ - cFYI(1, "%s length: 0x%x, smb_buf_length: 0x%x", __func__, length, len); + cifs_dbg(FYI, "%s length: 0x%x, smb_buf_length: 0x%x\n", + __func__, length, len); /* * Add function to do table lookup of StructureSize by command @@ -117,12 +118,13 @@ smb2_check_message(char *buf, unsigned int length) */ return 0; } else { - cERROR(1, "Length less than SMB header size"); + cifs_dbg(VFS, "Length less than SMB header size\n"); } return 1; } if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - 4) { - cERROR(1, "SMB length greater than maximum, mid=%llu", mid); + cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n", + mid); return 1; } @@ -130,14 +132,14 @@ smb2_check_message(char *buf, unsigned int length) return 1; if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) { - cERROR(1, "Illegal structure size %u", - le16_to_cpu(hdr->StructureSize)); + cifs_dbg(VFS, "Illegal structure size %u\n", + le16_to_cpu(hdr->StructureSize)); return 1; } command = le16_to_cpu(hdr->Command); if (command >= NUMBER_OF_SMB2_COMMANDS) { - cERROR(1, "Illegal SMB2 command %d", command); + cifs_dbg(VFS, "Illegal SMB2 command %d\n", command); return 1; } @@ -145,30 +147,30 @@ smb2_check_message(char *buf, unsigned int length) if (command != SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0 || pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) { /* error packets have 9 byte structure size */ - cERROR(1, "Illegal response size %u for command %d", - le16_to_cpu(pdu->StructureSize2), command); + cifs_dbg(VFS, "Illegal response 
size %u for command %d\n", + le16_to_cpu(pdu->StructureSize2), command); return 1; } else if (command == SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0) && (le16_to_cpu(pdu->StructureSize2) != 44) && (le16_to_cpu(pdu->StructureSize2) != 36)) { /* special case for SMB2.1 lease break message */ - cERROR(1, "Illegal response size %d for oplock break", - le16_to_cpu(pdu->StructureSize2)); + cifs_dbg(VFS, "Illegal response size %d for oplock break\n", + le16_to_cpu(pdu->StructureSize2)); return 1; } } if (4 + len != length) { - cERROR(1, "Total length %u RFC1002 length %u mismatch mid %llu", - length, 4 + len, mid); + cifs_dbg(VFS, "Total length %u RFC1002 length %u mismatch mid %llu\n", + length, 4 + len, mid); return 1; } clc_len = smb2_calc_size(hdr); if (4 + len != clc_len) { - cFYI(1, "Calculated size %u length %u mismatch mid %llu", - clc_len, 4 + len, mid); + cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n", + clc_len, 4 + len, mid); /* Windows 7 server returns 24 bytes more */ if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) return 0; @@ -267,7 +269,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) case SMB2_CHANGE_NOTIFY: default: /* BB FIXME for unimplemented cases above */ - cERROR(1, "no length check for command"); + cifs_dbg(VFS, "no length check for command\n"); break; } @@ -276,20 +278,20 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) * we have little choice but to ignore the data area in this case. 
*/ if (*off > 4096) { - cERROR(1, "offset %d too large, data area ignored", *off); + cifs_dbg(VFS, "offset %d too large, data area ignored\n", *off); *len = 0; *off = 0; } else if (*off < 0) { - cERROR(1, "negative offset %d to data invalid ignore data area", - *off); + cifs_dbg(VFS, "negative offset %d to data invalid ignore data area\n", + *off); *off = 0; *len = 0; } else if (*len < 0) { - cERROR(1, "negative data length %d invalid, data area ignored", - *len); + cifs_dbg(VFS, "negative data length %d invalid, data area ignored\n", + *len); *len = 0; } else if (*len > 128 * 1024) { - cERROR(1, "data area larger than 128K: %d", *len); + cifs_dbg(VFS, "data area larger than 128K: %d\n", *len); *len = 0; } @@ -324,7 +326,7 @@ smb2_calc_size(void *buf) goto calc_size_exit; smb2_get_data_area_len(&offset, &data_length, hdr); - cFYI(1, "SMB2 data length %d offset %d", data_length, offset); + cifs_dbg(FYI, "SMB2 data length %d offset %d\n", data_length, offset); if (data_length > 0) { /* @@ -335,15 +337,15 @@ smb2_calc_size(void *buf) * the size of the RFC1001 hdr. 
*/ if (offset + 4 + 1 < len) { - cERROR(1, "data area offset %d overlaps SMB2 header %d", - offset + 4 + 1, len); + cifs_dbg(VFS, "data area offset %d overlaps SMB2 header %d\n", + offset + 4 + 1, len); data_length = 0; } else { len = 4 + offset + data_length; } } calc_size_exit: - cFYI(1, "SMB2 len %d", len); + cifs_dbg(FYI, "SMB2 len %d\n", len); return len; } @@ -405,7 +407,7 @@ cifs_ses_oplock_break(struct work_struct *work) rc = SMB2_lease_break(0, tlink_tcon(lw->tlink), lw->lease_key, lw->lease_state); - cFYI(1, "Lease release rc %d", rc); + cifs_dbg(FYI, "Lease release rc %d\n", rc); cifs_put_tlink(lw->tlink); kfree(lw); } @@ -426,15 +428,13 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED); lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL); - if (!lw) { - cERROR(1, "Memory allocation failed during lease break check"); + if (!lw) return false; - } INIT_WORK(&lw->lease_break, cifs_ses_oplock_break); lw->lease_state = rsp->NewLeaseState; - cFYI(1, "Checking for lease break"); + cifs_dbg(FYI, "Checking for lease break\n"); /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); @@ -455,9 +455,9 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) SMB2_LEASE_KEY_SIZE)) continue; - cFYI(1, "found in the open list"); - cFYI(1, "lease key match, lease break 0x%d", - le32_to_cpu(rsp->NewLeaseState)); + cifs_dbg(FYI, "found in the open list\n"); + cifs_dbg(FYI, "lease key match, lease break 0x%d\n", + le32_to_cpu(rsp->NewLeaseState)); smb2_set_oplock_level(cinode, smb2_map_lease_to_oplock(rsp->NewLeaseState)); @@ -489,9 +489,9 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) &lw->lease_break); } - cFYI(1, "found in the pending open list"); - cFYI(1, "lease key match, lease break 0x%d", - le32_to_cpu(rsp->NewLeaseState)); + cifs_dbg(FYI, "found in the pending open list\n"); + cifs_dbg(FYI, "lease key match, lease break 0x%d\n", 
+ le32_to_cpu(rsp->NewLeaseState)); open->oplock = smb2_map_lease_to_oplock(rsp->NewLeaseState); @@ -506,7 +506,7 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) } spin_unlock(&cifs_tcp_ses_lock); kfree(lw); - cFYI(1, "Can not process lease break - no lease matched"); + cifs_dbg(FYI, "Can not process lease break - no lease matched\n"); return false; } @@ -520,7 +520,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) struct cifsInodeInfo *cinode; struct cifsFileInfo *cfile; - cFYI(1, "Checking for oplock break"); + cifs_dbg(FYI, "Checking for oplock break\n"); if (rsp->hdr.Command != SMB2_OPLOCK_BREAK) return false; @@ -533,7 +533,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) return false; } - cFYI(1, "oplock level 0x%d", rsp->OplockLevel); + cifs_dbg(FYI, "oplock level 0x%d\n", rsp->OplockLevel); /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); @@ -553,7 +553,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cfile->fid.volatile_fid) continue; - cFYI(1, "file id match, oplock break"); + cifs_dbg(FYI, "file id match, oplock break\n"); cinode = CIFS_I(cfile->dentry->d_inode); if (!cinode->clientCanCacheAll && @@ -573,11 +573,11 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) } spin_unlock(&cifs_file_list_lock); spin_unlock(&cifs_tcp_ses_lock); - cFYI(1, "No matching file for oplock break"); + cifs_dbg(FYI, "No matching file for oplock break\n"); return true; } } spin_unlock(&cifs_tcp_ses_lock); - cFYI(1, "Can not process oplock break for non-existent connection"); + cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n"); return false; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c9c7aa7ed966..f2e76f3b0c61 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -38,13 +38,13 @@ change_conf(struct TCP_Server_Info *server) case 1: server->echoes = false; server->oplocks = 
false; - cERROR(1, "disabling echoes and oplocks"); + cifs_dbg(VFS, "disabling echoes and oplocks\n"); break; case 2: server->echoes = true; server->oplocks = false; server->echo_credits = 1; - cFYI(1, "disabling oplocks"); + cifs_dbg(FYI, "disabling oplocks\n"); break; default: server->echoes = true; @@ -147,10 +147,10 @@ smb2_dump_detail(void *buf) #ifdef CONFIG_CIFS_DEBUG2 struct smb2_hdr *smb = (struct smb2_hdr *)buf; - cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d", - smb->Command, smb->Status, smb->Flags, smb->MessageId, - smb->ProcessId); - cERROR(1, "smb buf %p len %u", smb, smb2_calc_size(smb)); + cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n", + smb->Command, smb->Status, smb->Flags, smb->MessageId, + smb->ProcessId); + cifs_dbg(VFS, "smb buf %p len %u\n", smb, smb2_calc_size(smb)); #endif } @@ -436,7 +436,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, &oplock, NULL); kfree(utf16_path); if (rc) { - cERROR(1, "open dir failed"); + cifs_dbg(VFS, "open dir failed\n"); return rc; } @@ -448,7 +448,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_query_directory(xid, tcon, persistent_fid, volatile_fid, 0, srch_inf); if (rc) { - cERROR(1, "query directory failed"); + cifs_dbg(VFS, "query directory failed\n"); SMB2_close(xid, tcon, persistent_fid, volatile_fid); } return rc; @@ -744,4 +744,5 @@ struct smb_version_values smb30_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, + .oplock_read = SMB2_OPLOCK_LEVEL_II, }; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 41d9d0725f0f..2b95ce2b54e8 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -155,8 +155,8 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) if ((smb2_command != SMB2_WRITE) && (smb2_command != SMB2_CREATE) && (smb2_command != SMB2_TREE_DISCONNECT)) { - cFYI(1, "can not send cmd %d while umounting", - smb2_command); + cifs_dbg(FYI, "can not 
send cmd %d while umounting\n", + smb2_command); return -ENODEV; } } @@ -200,7 +200,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) * back on-line */ if (!tcon->retry) { - cFYI(1, "gave up waiting on reconnect in smb_init"); + cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); return -EHOSTDOWN; } } @@ -227,7 +227,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) cifs_mark_open_files_invalid(tcon); rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); mutex_unlock(&tcon->ses->session_mutex); - cFYI(1, "reconnect tcon rc = %d", rc); + cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) goto out; atomic_inc(&tconInfoReconnectCount); @@ -335,7 +335,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) char *security_blob; int flags = CIFS_NEG_OP; - cFYI(1, "Negotiate protocol"); + cifs_dbg(FYI, "Negotiate protocol\n"); if (ses->server) server = ses->server; @@ -354,7 +354,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) else /* if override flags set only sign/seal OR them with global auth */ sec_flags = global_secflags | ses->overrideSecFlg; - cFYI(1, "sec_flags 0x%x", sec_flags); + cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags); req->hdr.SessionId = 0; @@ -389,19 +389,19 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) if (rc != 0) goto neg_exit; - cFYI(1, "mode 0x%x", rsp->SecurityMode); + cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode); /* BB we may eventually want to match the negotiated vs. 
requested dialect, even though we are only requesting one at a time */ if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) - cFYI(1, "negotiated smb2.0 dialect"); + cifs_dbg(FYI, "negotiated smb2.0 dialect\n"); else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) - cFYI(1, "negotiated smb2.1 dialect"); + cifs_dbg(FYI, "negotiated smb2.1 dialect\n"); else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID)) - cFYI(1, "negotiated smb3.0 dialect"); + cifs_dbg(FYI, "negotiated smb3.0 dialect\n"); else { - cERROR(1, "Illegal dialect returned by server %d", - le16_to_cpu(rsp->DialectRevision)); + cifs_dbg(VFS, "Illegal dialect returned by server %d\n", + le16_to_cpu(rsp->DialectRevision)); rc = -EIO; goto neg_exit; } @@ -419,35 +419,34 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, &rsp->hdr); if (blob_length == 0) { - cERROR(1, "missing security blob on negprot"); + cifs_dbg(VFS, "missing security blob on negprot\n"); rc = -EIO; goto neg_exit; } - cFYI(1, "sec_flags 0x%x", sec_flags); + cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags); if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { - cFYI(1, "Signing required"); + cifs_dbg(FYI, "Signing required\n"); if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED | SMB2_NEGOTIATE_SIGNING_ENABLED))) { - cERROR(1, "signing required but server lacks support"); + cifs_dbg(VFS, "signing required but server lacks support\n"); rc = -EOPNOTSUPP; goto neg_exit; } server->sec_mode |= SECMODE_SIGN_REQUIRED; } else if (sec_flags & CIFSSEC_MAY_SIGN) { - cFYI(1, "Signing optional"); + cifs_dbg(FYI, "Signing optional\n"); if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) { - cFYI(1, "Server requires signing"); + cifs_dbg(FYI, "Server requires signing\n"); server->sec_mode |= SECMODE_SIGN_REQUIRED; } else { server->sec_mode &= ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); } } else { - cFYI(1, "Signing disabled"); + 
cifs_dbg(FYI, "Signing disabled\n"); if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) { - cERROR(1, "Server requires packet signing to be enabled" - " in /proc/fs/cifs/SecurityFlags."); + cifs_dbg(VFS, "Server requires packet signing to be enabled in /proc/fs/cifs/SecurityFlags\n"); rc = -EOPNOTSUPP; goto neg_exit; } @@ -489,7 +488,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, char *ntlmssp_blob = NULL; bool use_spnego = false; /* else use raw ntlmssp */ - cFYI(1, "Session Setup"); + cifs_dbg(FYI, "Session Setup\n"); if (ses->server) server = ses->server; @@ -522,7 +521,7 @@ ssetup_ntlmssp_authenticate: else /* if override flags set only sign/seal OR them with global auth */ sec_flags = global_secflags | ses->overrideSecFlg; - cFYI(1, "sec_flags 0x%x", sec_flags); + cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags); req->hdr.SessionId = 0; /* First session, not a reauthenticate */ req->VcNumber = 0; /* MBZ */ @@ -558,7 +557,7 @@ ssetup_ntlmssp_authenticate: sizeof(struct _NEGOTIATE_MESSAGE), ntlmssp_blob); */ /* BB eventually need to add this */ - cERROR(1, "spnego not supported for SMB2 yet"); + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); rc = -EOPNOTSUPP; kfree(ntlmssp_blob); goto ssetup_exit; @@ -572,14 +571,14 @@ ssetup_ntlmssp_authenticate: ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500, GFP_KERNEL); if (ntlmssp_blob == NULL) { - cERROR(1, "failed to malloc ntlmssp blob"); rc = -ENOMEM; goto ssetup_exit; } rc = build_ntlmssp_auth_blob(ntlmssp_blob, &blob_length, ses, nls_cp); if (rc) { - cFYI(1, "build_ntlmssp_auth_blob failed %d", rc); + cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", + rc); goto ssetup_exit; /* BB double check error handling */ } if (use_spnego) { @@ -587,7 +586,7 @@ ssetup_ntlmssp_authenticate: &security_blob, blob_length, ntlmssp_blob); */ - cERROR(1, "spnego not supported for SMB2 yet"); + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); rc = -EOPNOTSUPP; kfree(ntlmssp_blob); goto 
ssetup_exit; @@ -595,7 +594,7 @@ ssetup_ntlmssp_authenticate: security_blob = ntlmssp_blob; } } else { - cERROR(1, "illegal ntlmssp phase"); + cifs_dbg(VFS, "illegal ntlmssp phase\n"); rc = -EIO; goto ssetup_exit; } @@ -620,13 +619,13 @@ ssetup_ntlmssp_authenticate: if (resp_buftype != CIFS_NO_BUFFER && rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) { if (phase != NtLmNegotiate) { - cERROR(1, "Unexpected more processing error"); + cifs_dbg(VFS, "Unexpected more processing error\n"); goto ssetup_exit; } if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != le16_to_cpu(rsp->SecurityBufferOffset)) { - cERROR(1, "Invalid security buffer offset %d", - le16_to_cpu(rsp->SecurityBufferOffset)); + cifs_dbg(VFS, "Invalid security buffer offset %d\n", + le16_to_cpu(rsp->SecurityBufferOffset)); rc = -EIO; goto ssetup_exit; } @@ -667,7 +666,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) int rc = 0; struct TCP_Server_Info *server; - cFYI(1, "disconnect session %p", ses); + cifs_dbg(FYI, "disconnect session %p\n", ses); if (ses && (ses->server)) server = ses->server; @@ -711,7 +710,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, struct TCP_Server_Info *server; __le16 *unc_path = NULL; - cFYI(1, "TCON"); + cifs_dbg(FYI, "TCON\n"); if ((ses->server) && tree) server = ses->server; @@ -775,15 +774,15 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, } if (rsp->ShareType & SMB2_SHARE_TYPE_DISK) - cFYI(1, "connection to disk share"); + cifs_dbg(FYI, "connection to disk share\n"); else if (rsp->ShareType & SMB2_SHARE_TYPE_PIPE) { tcon->ipc = true; - cFYI(1, "connection to pipe share"); + cifs_dbg(FYI, "connection to pipe share\n"); } else if (rsp->ShareType & SMB2_SHARE_TYPE_PRINT) { tcon->print = true; - cFYI(1, "connection to printer"); + cifs_dbg(FYI, "connection to printer\n"); } else { - cERROR(1, "unknown share type %d", rsp->ShareType); + cifs_dbg(VFS, "unknown share type %d\n", rsp->ShareType); rc = 
-EOPNOTSUPP; goto tcon_error_exit; } @@ -797,7 +796,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) - cERROR(1, "DFS capability contradicts DFS flag"); + cifs_dbg(VFS, "DFS capability contradicts DFS flag\n"); tcon_exit: free_rsp_buf(resp_buftype, rsp); @@ -806,7 +805,7 @@ tcon_exit: tcon_error_exit: if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { - cERROR(1, "BAD_NETWORK_NAME: %s", tree); + cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); tcon->bad_network_name = true; } goto tcon_exit; @@ -820,7 +819,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) struct TCP_Server_Info *server; struct cifs_ses *ses = tcon->ses; - cFYI(1, "Tree Disconnect"); + cifs_dbg(FYI, "Tree Disconnect\n"); if (ses && (ses->server)) server = ses->server; @@ -846,12 +845,10 @@ create_lease_buf(u8 *lease_key, u8 oplock) { struct create_lease *buf; - buf = kmalloc(sizeof(struct create_lease), GFP_KERNEL); + buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL); if (!buf) return NULL; - memset(buf, 0, sizeof(struct create_lease)); - buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) @@ -925,7 +922,7 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path, int rc = 0; int num_iovecs = 2; - cFYI(1, "create/open"); + cifs_dbg(FYI, "create/open\n"); if (ses && (ses->server)) server = ses->server; @@ -1051,7 +1048,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, int resp_buftype; int rc = 0; - cFYI(1, "Close"); + cifs_dbg(FYI, "Close\n"); if (ses && (ses->server)) server = ses->server; @@ -1097,20 +1094,20 @@ validate_buf(unsigned int offset, unsigned int buffer_length, if (buffer_length < min_buf_size) { - cERROR(1, "buffer length %d smaller than minimum size %d", - buffer_length, min_buf_size); + 
cifs_dbg(VFS, "buffer length %d smaller than minimum size %d\n", + buffer_length, min_buf_size); return -EINVAL; } /* check if beyond RFC1001 maximum length */ if ((smb_len > 0x7FFFFF) || (buffer_length > 0x7FFFFF)) { - cERROR(1, "buffer length %d or smb length %d too large", - buffer_length, smb_len); + cifs_dbg(VFS, "buffer length %d or smb length %d too large\n", + buffer_length, smb_len); return -EINVAL; } if ((begin_of_buf > end_of_smb) || (end_of_buf > end_of_smb)) { - cERROR(1, "illegal server response, bad offset to data"); + cifs_dbg(VFS, "illegal server response, bad offset to data\n"); return -EINVAL; } @@ -1155,7 +1152,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, struct TCP_Server_Info *server; struct cifs_ses *ses = tcon->ses; - cFYI(1, "Query Info"); + cifs_dbg(FYI, "Query Info\n"); if (ses && (ses->server)) server = ses->server; @@ -1247,7 +1244,7 @@ SMB2_echo(struct TCP_Server_Info *server) struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 }; - cFYI(1, "In echo request"); + cifs_dbg(FYI, "In echo request\n"); rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req); if (rc) @@ -1262,7 +1259,7 @@ SMB2_echo(struct TCP_Server_Info *server) rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, server, CIFS_ECHO_OP); if (rc) - cFYI(1, "Echo request failed: %d", rc); + cifs_dbg(FYI, "Echo request failed: %d\n", rc); cifs_small_buf_release(req); return rc; @@ -1279,7 +1276,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, int resp_buftype; int rc = 0; - cFYI(1, "Flush"); + cifs_dbg(FYI, "Flush\n"); if (ses && (ses->server)) server = ses->server; @@ -1379,8 +1376,9 @@ smb2_readv_callback(struct mid_q_entry *mid) .rq_pagesz = rdata->pagesz, .rq_tailsz = rdata->tailsz }; - cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__, - mid->mid, mid->mid_state, rdata->result, rdata->bytes); + cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n", + __func__, mid->mid, mid->mid_state, 
rdata->result, + rdata->bytes); switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -1392,8 +1390,8 @@ smb2_readv_callback(struct mid_q_entry *mid) rc = smb2_verify_signature(&rqst, server); if (rc) - cERROR(1, "SMB signature verification returned " - "error = %d", rc); + cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + rc); } /* FIXME: should this be counted toward the initiating task? */ task_io_account_read(rdata->bytes); @@ -1426,8 +1424,8 @@ smb2_async_readv(struct cifs_readdata *rdata) struct smb_rqst rqst = { .rq_iov = &rdata->iov, .rq_nvec = 1 }; - cFYI(1, "%s: offset=%llu bytes=%u", __func__, - rdata->offset, rdata->bytes); + cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", + __func__, rdata->offset, rdata->bytes); io_parms.tcon = tlink_tcon(rdata->cfile->tlink); io_parms.offset = rdata->offset; @@ -1481,13 +1479,13 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc) { cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); - cERROR(1, "Send error in read = %d", rc); + cifs_dbg(VFS, "Send error in read = %d\n", rc); } else { *nbytes = le32_to_cpu(rsp->DataLength); if ((*nbytes > CIFS_MAX_MSGSIZE) || (*nbytes > io_parms->length)) { - cFYI(1, "bad length %d for count %d", *nbytes, - io_parms->length); + cifs_dbg(FYI, "bad length %d for count %d\n", + *nbytes, io_parms->length); rc = -EIO; *nbytes = 0; } @@ -1597,7 +1595,8 @@ smb2_async_writev(struct cifs_writedata *wdata) rqst.rq_pagesz = wdata->pagesz; rqst.rq_tailsz = wdata->tailsz; - cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes); + cifs_dbg(FYI, "async write at %llu %u bytes\n", + wdata->offset, wdata->bytes); req->Length = cpu_to_le32(wdata->bytes); @@ -1670,7 +1669,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc) { cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE); - cERROR(1, "Send error in write = %d", rc); + cifs_dbg(VFS, "Send error in write = %d\n", rc); } else *nbytes = le32_to_cpu(rsp->DataLength); @@ 
-1696,14 +1695,14 @@ num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size) ((char *)entryptr + next_offset); if ((char *)entryptr + size > end_of_buf) { - cERROR(1, "malformed search entry would overflow"); + cifs_dbg(VFS, "malformed search entry would overflow\n"); break; } len = le32_to_cpu(entryptr->FileNameLength); if ((char *)entryptr + len + size > end_of_buf) { - cERROR(1, "directory entry name would overflow frame " - "end of buf %p", end_of_buf); + cifs_dbg(VFS, "directory entry name would overflow frame end of buf %p\n", + end_of_buf); break; } @@ -1759,8 +1758,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1; break; default: - cERROR(1, "info level %u isn't supported", - srch_inf->info_level); + cifs_dbg(VFS, "info level %u isn't supported\n", + srch_inf->info_level); rc = -EINVAL; goto qdir_exit; } @@ -1824,15 +1823,15 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, num_entries(srch_inf->srch_entries_start, end_of_smb, &srch_inf->last_entry, info_buf_size); srch_inf->index_of_last_entry += srch_inf->entries_in_buffer; - cFYI(1, "num entries %d last_index %lld srch start %p srch end %p", - srch_inf->entries_in_buffer, srch_inf->index_of_last_entry, - srch_inf->srch_entries_start, srch_inf->last_entry); + cifs_dbg(FYI, "num entries %d last_index %lld srch start %p srch end %p\n", + srch_inf->entries_in_buffer, srch_inf->index_of_last_entry, + srch_inf->srch_entries_start, srch_inf->last_entry); if (resp_buftype == CIFS_LARGE_BUFFER) srch_inf->smallBuf = false; else if (resp_buftype == CIFS_SMALL_BUFFER) srch_inf->smallBuf = true; else - cERROR(1, "illegal search buffer type"); + cifs_dbg(VFS, "illegal search buffer type\n"); if (rsp->hdr.Status == STATUS_NO_MORE_FILES) srch_inf->endOfSearch = 1; @@ -2017,7 +2016,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_oplock_break *req = NULL; - 
cFYI(1, "SMB2_oplock_break"); + cifs_dbg(FYI, "SMB2_oplock_break\n"); rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); if (rc) @@ -2033,7 +2032,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); - cFYI(1, "Send error in Oplock Break = %d", rc); + cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc); } return rc; @@ -2058,7 +2057,7 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, int rc; struct smb2_query_info_req *req; - cFYI(1, "Query FSInfo level %d", level); + cifs_dbg(FYI, "Query FSInfo level %d\n", level); if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) return -EIO; @@ -2131,7 +2130,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, int resp_buf_type; unsigned int count; - cFYI(1, "smb2_lockv num lock %d", num_lock); + cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock); rc = small_smb2_init(SMB2_LOCK, tcon, (void **) &req); if (rc) @@ -2155,7 +2154,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP); if (rc) { - cFYI(1, "Send error in smb2_lockv = %d", rc); + cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc); cifs_stats_fail_inc(tcon, SMB2_LOCK_HE); } @@ -2186,7 +2185,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_lease_ack *req = NULL; - cFYI(1, "SMB2_lease_break"); + cifs_dbg(FYI, "SMB2_lease_break\n"); rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); if (rc) @@ -2204,7 +2203,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); - cFYI(1, "Send error in Lease Break = %d", rc); + cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc); } return rc; diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 8dd73e61d762..01f0ac800780 100644 --- 
a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -55,13 +55,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) rc = crypto_shash_setkey(server->secmech.hmacsha256, server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { - cERROR(1, "%s: Could not update with response\n", __func__); + cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; } rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); if (rc) { - cERROR(1, "%s: Could not init md5\n", __func__); + cifs_dbg(VFS, "%s: Could not init md5\n", __func__); return rc; } @@ -69,7 +69,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) if (iov[i].iov_len == 0) continue; if (iov[i].iov_base == NULL) { - cERROR(1, "null iovec entry"); + cifs_dbg(VFS, "null iovec entry\n"); return -EIO; } /* @@ -90,8 +90,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) iov[i].iov_base, iov[i].iov_len); } if (rc) { - cERROR(1, "%s: Could not update with payload\n", - __func__); + cifs_dbg(VFS, "%s: Could not update with payload\n", + __func__); return rc; } } @@ -109,7 +109,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, sigptr); if (rc) - cERROR(1, "%s: Could not generate sha256 hash\n", __func__); + cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE); @@ -119,7 +119,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { - cFYI(1, "smb3 signatures not supported yet"); + cifs_dbg(FYI, "smb3 signatures not supported yet\n"); return -EOPNOTSUPP; } @@ -163,8 +163,8 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) /* Do not need to verify session setups with signature "BSRSPYL " */ if 
(memcmp(smb2_pdu->Signature, "BSRSPYL ", 8) == 0) - cFYI(1, "dummy signature received for smb command 0x%x", - smb2_pdu->Command); + cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n", + smb2_pdu->Command); /* * Save off the origiginal signature so we can modify the smb and check @@ -205,7 +205,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer, struct mid_q_entry *temp; if (server == NULL) { - cERROR(1, "Null TCP session in smb2_mid_entry_alloc"); + cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n"); return NULL; } @@ -241,7 +241,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf, return -ENOENT; if (ses->server->tcpStatus == CifsNeedReconnect) { - cFYI(1, "tcp session dead - return to caller to retry"); + cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); return -EAGAIN; } @@ -281,8 +281,8 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, rc = smb2_verify_signature(&rqst, server); if (rc) - cERROR(1, "SMB signature verification returned error = " - "%d", rc); + cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + rc); } return map_smb2_to_linux_error(mid->resp_buf, log_error); diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index a0a58fbe2c10..43eb1367b103 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -78,7 +78,7 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm_des)) { rc = PTR_ERR(tfm_des); - cERROR(1, "could not allocate des crypto API"); + cifs_dbg(VFS, "could not allocate des crypto API\n"); goto smbhash_err; } @@ -91,7 +91,7 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8); if (rc) - cERROR(1, "could not encrypt crypt key rc: %d", rc); + cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc); crypto_free_blkcipher(tfm_des); smbhash_err: 
@@ -139,14 +139,14 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) md4 = crypto_alloc_shash("md4", 0, 0); if (IS_ERR(md4)) { rc = PTR_ERR(md4); - cERROR(1, "%s: Crypto md4 allocation error %d", __func__, rc); + cifs_dbg(VFS, "%s: Crypto md4 allocation error %d\n", + __func__, rc); return rc; } size = sizeof(struct shash_desc) + crypto_shash_descsize(md4); sdescmd4 = kmalloc(size, GFP_KERNEL); if (!sdescmd4) { rc = -ENOMEM; - cERROR(1, "%s: Memory allocation failure", __func__); goto mdfour_err; } sdescmd4->shash.tfm = md4; @@ -154,17 +154,17 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) rc = crypto_shash_init(&sdescmd4->shash); if (rc) { - cERROR(1, "%s: Could not init md4 shash", __func__); + cifs_dbg(VFS, "%s: Could not init md4 shash\n", __func__); goto mdfour_err; } rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len); if (rc) { - cERROR(1, "%s: Could not update with link_str", __func__); + cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__); goto mdfour_err; } rc = crypto_shash_final(&sdescmd4->shash, md4_hash); if (rc) - cERROR(1, "%s: Could not genereate md4 hash", __func__); + cifs_dbg(VFS, "%s: Could not generate md4 hash\n", __func__); mdfour_err: crypto_free_shash(md4); @@ -238,7 +238,8 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24, rc = E_md4hash(passwd, p16, codepage); if (rc) { - cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); + cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n", + __func__, rc); return rc; } memcpy(p21, p16, 16); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 1a528680ec5a..bfbf4700d160 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -49,7 +49,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) struct mid_q_entry *temp; if (server == NULL) { - cERROR(1, "Null TCP session in AllocMidQEntry"); + cifs_dbg(VFS, "Null TCP session in AllocMidQEntry\n"); 
return NULL; } @@ -61,7 +61,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) temp->mid = smb_buffer->Mid; /* always LE */ temp->pid = current->pid; temp->command = cpu_to_le16(smb_buffer->Command); - cFYI(1, "For smb_command %d", smb_buffer->Command); + cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command); /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ /* when mid allocated can be before when sent */ temp->when_alloc = jiffies; @@ -179,17 +179,11 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec, */ rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec], n_vec - first_vec, remaining); - if (rc == -ENOSPC || rc == -EAGAIN) { - /* - * Catch if a low level driver returns -ENOSPC. This - * WARN_ON will be removed by 3.10 if no one reports - * seeing this. - */ - WARN_ON_ONCE(rc == -ENOSPC); + if (rc == -EAGAIN) { i++; if (i >= 14 || (!server->noblocksnd && (i > 2))) { - cERROR(1, "sends on sock %p stuck for 15 " - "seconds", ssocket); + cifs_dbg(VFS, "sends on sock %p stuck for 15 seconds\n", + ssocket); rc = -EAGAIN; break; } @@ -209,14 +203,14 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec, } if (rc > remaining) { - cERROR(1, "sent %d requested %d", rc, remaining); + cifs_dbg(VFS, "sent %d requested %d\n", rc, remaining); break; } if (rc == 0) { /* should never happen, letting socket clear before retrying is our only obvious option here */ - cERROR(1, "tcp sent no data"); + cifs_dbg(VFS, "tcp sent no data\n"); msleep(500); continue; } @@ -291,7 +285,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) if (ssocket == NULL) return -ENOTSOCK; - cFYI(1, "Sending smb: smb_len=%u", smb_buf_length); + cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length); dump_smb(iov[0].iov_base, iov[0].iov_len); /* cork the socket */ @@ -324,8 +318,8 @@ uncork: (char *)&val, sizeof(val)); if ((total_len > 0) && (total_len != 
smb_buf_length + 4)) { - cFYI(1, "partial send (wanted=%u sent=%zu): terminating " - "session", smb_buf_length + 4, total_len); + cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n", + smb_buf_length + 4, total_len); /* * If we have only sent part of an SMB then the next SMB could * be taken as the remainder of this one. We need to kill the @@ -335,7 +329,8 @@ uncork: } if (rc < 0 && rc != -EINTR) - cERROR(1, "Error %d sending data on socket to server", rc); + cifs_dbg(VFS, "Error %d sending data on socket to server\n", + rc); else rc = 0; @@ -427,7 +422,7 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, } if (ses->server->tcpStatus == CifsNeedReconnect) { - cFYI(1, "tcp session dead - return to caller to retry"); + cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); return -EAGAIN; } @@ -527,6 +522,9 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, rc = smb_send_rqst(server, rqst); cifs_in_send_dec(server); cifs_save_when_sent(mid); + + if (rc < 0) + server->sequence_number -= 2; mutex_unlock(&server->srv_mutex); if (rc == 0) @@ -559,7 +557,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, iov[0].iov_len = get_rfc1002_length(in_buf) + 4; flags |= CIFS_NO_RESP; rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags); - cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc); + cifs_dbg(NOISY, "SendRcvNoRsp flags %d rc %d\n", flags, rc); return rc; } @@ -569,8 +567,8 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) { int rc = 0; - cFYI(1, "%s: cmd=%d mid=%llu state=%d", __func__, - le16_to_cpu(mid->command), mid->mid, mid->mid_state); + cifs_dbg(FYI, "%s: cmd=%d mid=%llu state=%d\n", + __func__, le16_to_cpu(mid->command), mid->mid, mid->mid_state); spin_lock(&GlobalMid_Lock); switch (mid->mid_state) { @@ -588,8 +586,8 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) break; default: 
list_del_init(&mid->qhead); - cERROR(1, "%s: invalid mid state mid=%llu state=%d", __func__, - mid->mid, mid->mid_state); + cifs_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n", + __func__, mid->mid, mid->mid_state); rc = -EIO; } spin_unlock(&GlobalMid_Lock); @@ -624,10 +622,10 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, iov.iov_len = len; /* FIXME: add code to kill session */ rc = cifs_verify_signature(&rqst, server, - mid->sequence_number + 1); + mid->sequence_number); if (rc) - cERROR(1, "SMB signature verification returned error = " - "%d", rc); + cifs_dbg(VFS, "SMB signature verification returned error = %d\n", + rc); } /* BB special case reconnect tid and uid here? */ @@ -672,7 +670,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, if ((ses == NULL) || (ses->server == NULL)) { cifs_small_buf_release(buf); - cERROR(1, "Null session"); + cifs_dbg(VFS, "Null session\n"); return -EIO; } @@ -716,6 +714,8 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, cifs_in_send_dec(ses->server); cifs_save_when_sent(midQ); + if (rc < 0) + ses->server->sequence_number -= 2; mutex_unlock(&ses->server->srv_mutex); if (rc < 0) { @@ -752,7 +752,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, if (!midQ->resp_buf || midQ->mid_state != MID_RESPONSE_RECEIVED) { rc = -EIO; - cFYI(1, "Bad MID state?"); + cifs_dbg(FYI, "Bad MID state?\n"); goto out; } @@ -788,11 +788,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, struct mid_q_entry *midQ; if (ses == NULL) { - cERROR(1, "Null smb session"); + cifs_dbg(VFS, "Null smb session\n"); return -EIO; } if (ses->server == NULL) { - cERROR(1, "Null tcp session"); + cifs_dbg(VFS, "Null tcp session\n"); return -EIO; } @@ -805,8 +805,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cERROR(1, "Illegal length, greater than maximum frame, %d", - 
be32_to_cpu(in_buf->smb_buf_length)); + cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", + be32_to_cpu(in_buf->smb_buf_length)); return -EIO; } @@ -840,6 +840,10 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); cifs_in_send_dec(ses->server); cifs_save_when_sent(midQ); + + if (rc < 0) + ses->server->sequence_number -= 2; + mutex_unlock(&ses->server->srv_mutex); if (rc < 0) @@ -871,7 +875,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (!midQ->resp_buf || !out_buf || midQ->mid_state != MID_RESPONSE_RECEIVED) { rc = -EIO; - cERROR(1, "Bad MID state?"); + cifs_dbg(VFS, "Bad MID state?\n"); goto out; } @@ -921,13 +925,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_ses *ses; if (tcon == NULL || tcon->ses == NULL) { - cERROR(1, "Null smb session"); + cifs_dbg(VFS, "Null smb session\n"); return -EIO; } ses = tcon->ses; if (ses->server == NULL) { - cERROR(1, "Null tcp session"); + cifs_dbg(VFS, "Null tcp session\n"); return -EIO; } @@ -940,8 +944,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { - cERROR(1, "Illegal length, greater than maximum frame, %d", - be32_to_cpu(in_buf->smb_buf_length)); + cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n", + be32_to_cpu(in_buf->smb_buf_length)); return -EIO; } @@ -973,6 +977,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); cifs_in_send_dec(ses->server); cifs_save_when_sent(midQ); + + if (rc < 0) + ses->server->sequence_number -= 2; + mutex_unlock(&ses->server->srv_mutex); if (rc < 0) { @@ -1038,7 +1046,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, /* rcvd frame is ok */ if (out_buf == NULL || midQ->mid_state != 
MID_RESPONSE_RECEIVED) { rc = -EIO; - cERROR(1, "Bad MID state?"); + cifs_dbg(VFS, "Bad MID state?\n"); goto out; } diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 5142f2c60278..09afda4cc58e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -68,12 +68,12 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) goto remove_ea_exit; } if (ea_name == NULL) { - cFYI(1, "Null xattr names not supported"); + cifs_dbg(FYI, "Null xattr names not supported\n"); } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) { - cFYI(1, - "illegal xattr request %s (only user namespace supported)", - ea_name); + cifs_dbg(FYI, + "illegal xattr request %s (only user namespace supported)\n", + ea_name); /* BB what if no namespace prefix? */ /* Should we just pass them to server, except for system and perhaps security prefixes? */ @@ -134,19 +134,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, search server for EAs or streams to returns as xattrs */ if (value_size > MAX_EA_VALUE_SIZE) { - cFYI(1, "size of EA value too large"); + cifs_dbg(FYI, "size of EA value too large\n"); rc = -EOPNOTSUPP; goto set_ea_exit; } if (ea_name == NULL) { - cFYI(1, "Null xattr names not supported"); + cifs_dbg(FYI, "Null xattr names not supported\n"); } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) - cFYI(1, "attempt to set cifs inode metadata"); + cifs_dbg(FYI, "attempt to set cifs inode metadata\n"); ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. 
prefix */ rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, @@ -167,8 +167,6 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, struct cifs_ntsd *pacl; pacl = kmalloc(value_size, GFP_KERNEL); if (!pacl) { - cFYI(1, "%s: Can't allocate memory for ACL", - __func__); rc = -ENOMEM; } else { memcpy(pacl, ea_value, value_size); @@ -179,7 +177,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, kfree(pacl); } #else - cFYI(1, "Set CIFS ACL not supported yet"); + cifs_dbg(FYI, "Set CIFS ACL not supported yet\n"); #endif /* CONFIG_CIFS_ACL */ } else { int temp; @@ -193,9 +191,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, ACL_TYPE_ACCESS, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - cFYI(1, "set POSIX ACL rc %d", rc); + cifs_dbg(FYI, "set POSIX ACL rc %d\n", rc); #else - cFYI(1, "set POSIX ACL not supported"); + cifs_dbg(FYI, "set POSIX ACL not supported\n"); #endif } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { @@ -206,13 +204,13 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, ACL_TYPE_DEFAULT, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - cFYI(1, "set POSIX default ACL rc %d", rc); + cifs_dbg(FYI, "set POSIX default ACL rc %d\n", rc); #else - cFYI(1, "set default POSIX ACL not supported"); + cifs_dbg(FYI, "set default POSIX ACL not supported\n"); #endif } else { - cFYI(1, "illegal xattr request %s (only user namespace" - " supported)", ea_name); + cifs_dbg(FYI, "illegal xattr request %s (only user namespace supported)\n", + ea_name); /* BB what if no namespace prefix? */ /* Should we just pass them to server, except for system and perhaps security prefixes? 
*/ @@ -263,14 +261,14 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, /* return dos attributes as pseudo xattr */ /* return alt name if available as pseudo attr */ if (ea_name == NULL) { - cFYI(1, "Null xattr names not supported"); + cifs_dbg(FYI, "Null xattr names not supported\n"); } else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) { - cFYI(1, "attempt to query cifs inode metadata"); + cifs_dbg(FYI, "attempt to query cifs inode metadata\n"); /* revalidate/getattr then populate from inode */ } /* BB add else when above is implemented */ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ @@ -295,7 +293,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); #else - cFYI(1, "Query POSIX ACL not supported yet"); + cifs_dbg(FYI, "Query POSIX ACL not supported yet\n"); #endif /* CONFIG_CIFS_POSIX */ } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { @@ -307,7 +305,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); #else - cFYI(1, "Query POSIX default ACL not supported yet"); + cifs_dbg(FYI, "Query POSIX default ACL not supported yet\n"); #endif /* CONFIG_CIFS_POSIX */ } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, strlen(CIFS_XATTR_CIFS_ACL)) == 0) { @@ -319,8 +317,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, full_path, &acllen); if (IS_ERR(pacl)) { rc = PTR_ERR(pacl); - cERROR(1, "%s: error %zd getting sec desc", - __func__, rc); + cifs_dbg(VFS, "%s: error %zd getting sec desc\n", + __func__, rc); } else { if (ea_value) { if (acllen > buf_size) @@ -332,18 +330,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, kfree(pacl); } #else - cFYI(1, 
"Query CIFS ACL not supported yet"); + cifs_dbg(FYI, "Query CIFS ACL not supported yet\n"); #endif /* CONFIG_CIFS_ACL */ } else if (strncmp(ea_name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { - cFYI(1, "Trusted xattr namespace not supported yet"); + cifs_dbg(FYI, "Trusted xattr namespace not supported yet\n"); } else if (strncmp(ea_name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) { - cFYI(1, "Security xattr namespace not supported yet"); + cifs_dbg(FYI, "Security xattr namespace not supported yet\n"); } else - cFYI(1, - "illegal xattr request %s (only user namespace supported)", - ea_name); + cifs_dbg(FYI, + "illegal xattr request %s (only user namespace supported)\n", + ea_name); /* We could add an additional check for streams ie if proc/fs/cifs/streamstoxattr is set then diff --git a/fs/coda/file.c b/fs/coda/file.c index fa4c100bdc7d..380b798f8443 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -79,6 +79,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo return -EINVAL; host_inode = file_inode(host_file); + file_start_write(host_file); mutex_lock(&coda_inode->i_mutex); ret = host_file->f_op->write(host_file, buf, count, ppos); @@ -87,6 +88,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; mutex_unlock(&coda_inode->i_mutex); + file_end_write(host_file); return ret; } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index dada9d0abede..4dcc0d81a7aa 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -329,4 +329,5 @@ struct file_system_type coda_fs_type = { .kill_sb = kill_anon_super, .fs_flags = FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("coda"); diff --git a/fs/compat.c b/fs/compat.c index fe40fde29111..fc3b55dce184 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -44,10 +44,10 @@ #include <linux/signal.h> #include <linux/poll.h> #include <linux/mm.h> 
-#include <linux/eventpoll.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -68,8 +68,6 @@ int compat_printk(const char *fmt, ...) return ret; } -#include "read_write.h" - /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. @@ -558,6 +556,10 @@ ssize_t compat_rw_copy_check_uvector(int type, } *ret_pointer = iov; + ret = -EFAULT; + if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + goto out; + /* * Single unix specification: * We should -EINVAL if an element length is not >= 0 and fitting an @@ -1065,215 +1067,6 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, } #endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ -static ssize_t compat_do_readv_writev(int type, struct file *file, - const struct compat_iovec __user *uvector, - unsigned long nr_segs, loff_t *pos) -{ - compat_ssize_t tot_len; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; - ssize_t ret; - io_fn_t fn; - iov_fn_t fnv; - - ret = -EINVAL; - if (!file->f_op) - goto out; - - ret = -EFAULT; - if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) - goto out; - - tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, - UIO_FASTIOV, iovstack, &iov); - if (tot_len == 0) { - ret = 0; - goto out; - } - - ret = rw_verify_area(type, file, pos, tot_len); - if (ret < 0) - goto out; - - fnv = NULL; - if (type == READ) { - fn = file->f_op->read; - fnv = file->f_op->aio_read; - } else { - fn = (io_fn_t)file->f_op->write; - fnv = file->f_op->aio_write; - } - - if (fnv) - ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, - pos, fnv); - else - ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); - -out: - if (iov != iovstack) - kfree(iov); - if ((ret + (type == READ)) > 0) { - if (type == READ) - fsnotify_access(file); - else - fsnotify_modify(file); - } - return ret; -} - -static size_t compat_readv(struct 
file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos) -{ - ssize_t ret = -EBADF; - - if (!(file->f_mode & FMODE_READ)) - goto out; - - ret = -EINVAL; - if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) - goto out; - - ret = compat_do_readv_writev(READ, file, vec, vlen, pos); - -out: - if (ret > 0) - add_rchar(current, ret); - inc_syscr(current); - return ret; -} - -asmlinkage ssize_t -compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen) -{ - struct fd f = fdget(fd); - ssize_t ret; - loff_t pos; - - if (!f.file) - return -EBADF; - pos = f.file->f_pos; - ret = compat_readv(f.file, vec, vlen, &pos); - f.file->f_pos = pos; - fdput(f); - return ret; -} - -asmlinkage ssize_t -compat_sys_preadv64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos) -{ - struct fd f; - ssize_t ret; - - if (pos < 0) - return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; - ret = -ESPIPE; - if (f.file->f_mode & FMODE_PREAD) - ret = compat_readv(f.file, vec, vlen, &pos); - fdput(f); - return ret; -} - -asmlinkage ssize_t -compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_low, u32 pos_high) -{ - loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return compat_sys_preadv64(fd, vec, vlen, pos); -} - -static size_t compat_writev(struct file *file, - const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos) -{ - ssize_t ret = -EBADF; - - if (!(file->f_mode & FMODE_WRITE)) - goto out; - - ret = -EINVAL; - if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) - goto out; - - ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); - -out: - if (ret > 0) - add_wchar(current, ret); - inc_syscw(current); - return ret; -} - -asmlinkage ssize_t -compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen) -{ - struct fd f = fdget(fd); - ssize_t 
ret; - loff_t pos; - - if (!f.file) - return -EBADF; - pos = f.file->f_pos; - ret = compat_writev(f.file, vec, vlen, &pos); - f.file->f_pos = pos; - fdput(f); - return ret; -} - -asmlinkage ssize_t -compat_sys_pwritev64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos) -{ - struct fd f; - ssize_t ret; - - if (pos < 0) - return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; - ret = -ESPIPE; - if (f.file->f_mode & FMODE_PWRITE) - ret = compat_writev(f.file, vec, vlen, &pos); - fdput(f); - return ret; -} - -asmlinkage ssize_t -compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, u32 pos_low, u32 pos_high) -{ - loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return compat_sys_pwritev64(fd, vec, vlen, pos); -} - -asmlinkage long -compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, - unsigned int nr_segs, unsigned int flags) -{ - unsigned i; - struct iovec __user *iov; - if (nr_segs > UIO_MAXIOV) - return -EINVAL; - iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); - for (i = 0; i < nr_segs; i++) { - struct compat_iovec v; - if (get_user(v.iov_base, &iov32[i].iov_base) || - get_user(v.iov_len, &iov32[i].iov_len) || - put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || - put_user(v.iov_len, &iov[i].iov_len)) - return -EFAULT; - } - return sys_vmsplice(fd, iov, nr_segs, flags); -} - /* * Exactly like fs/open.c:sys_open(), except that it doesn't set the * O_LARGEFILE flag. 
@@ -1659,84 +1452,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, return ret; } -#ifdef CONFIG_EPOLL - -asmlinkage long compat_sys_epoll_pwait(int epfd, - struct compat_epoll_event __user *events, - int maxevents, int timeout, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) -{ - long err; - compat_sigset_t csigmask; - sigset_t ksigmask, sigsaved; - - /* - * If the caller wants a certain signal mask to be set during the wait, - * we apply it here. - */ - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) - return -EFAULT; - sigset_from_compat(&ksigmask, &csigmask); - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - err = sys_epoll_wait(epfd, events, maxevents, timeout); - - /* - * If we changed the signal mask, we need to restore the original one. - * In case we've got a signal while waiting, we do not restore the - * signal mask yet, and we allow do_signal() to deliver the signal on - * the way back to userspace, before the signal mask is restored. 
- */ - if (sigmask) { - if (err == -EINTR) { - memcpy(&current->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } else - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - } - - return err; -} - -#endif /* CONFIG_EPOLL */ - -#ifdef CONFIG_SIGNALFD - -asmlinkage long compat_sys_signalfd4(int ufd, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize, int flags) -{ - compat_sigset_t ss32; - sigset_t tmp; - sigset_t __user *ksigmask; - - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) - return -EFAULT; - sigset_from_compat(&tmp, &ss32); - ksigmask = compat_alloc_user_space(sizeof(sigset_t)); - if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) - return -EFAULT; - - return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); -} - -asmlinkage long compat_sys_signalfd(int ufd, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) -{ - return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); -} -#endif /* CONFIG_SIGNALFD */ - #ifdef CONFIG_FHANDLE /* * Exactly like fs/open.c:sys_open_by_handle_at(), except that it @@ -1748,25 +1463,3 @@ COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, return do_handle_open(mountdirfd, handle, flags); } #endif - -#ifdef __ARCH_WANT_COMPAT_SYS_SENDFILE -asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, - compat_off_t __user *offset, compat_size_t count) -{ - loff_t pos; - off_t off; - ssize_t ret; - - if (offset) { - if (unlikely(get_user(off, offset))) - return -EFAULT; - pos = off; - ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); - if (unlikely(put_user(pos, offset))) - return -EFAULT; - return ret; - } - - return do_sendfile(out_fd, in_fd, NULL, count, 0); -} -#endif /* __ARCH_WANT_COMPAT_SYS_SENDFILE */ diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 3ced75f765ca..996cdc5abb85 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -608,7 +608,6 @@ struct serial_struct32 { static int
serial_struct_ioctl(unsigned fd, unsigned cmd, struct serial_struct32 __user *ss32) { - typedef struct serial_struct SS; typedef struct serial_struct32 SS32; int err; struct serial_struct ss; diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index aee0a7ebbd8e..7f26c3cf75ae 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -114,6 +114,7 @@ static struct file_system_type configfs_fs_type = { .mount = configfs_do_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("configfs"); struct dentry *configfs_pin_fs(void) { diff --git a/fs/coredump.c b/fs/coredump.c index c6479658d487..dafafbafa731 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -263,7 +263,6 @@ static int zap_process(struct task_struct *start, int exit_code) struct task_struct *t; int nr = 0; - start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; @@ -280,8 +279,8 @@ static int zap_process(struct task_struct *start, int exit_code) return nr; } -static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, - struct core_state *core_state, int exit_code) +static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + struct core_state *core_state, int exit_code) { struct task_struct *g, *p; unsigned long flags; @@ -291,11 +290,16 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (!signal_group_exit(tsk->signal)) { mm->core_state = core_state; nr = zap_process(tsk, exit_code); + tsk->signal->group_exit_task = tsk; + /* ignore all signals except SIGKILL, see prepare_signal() */ + tsk->signal->flags = SIGNAL_GROUP_COREDUMP; + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); } spin_unlock_irq(&tsk->sighand->siglock); if (unlikely(nr < 0)) return nr; + tsk->flags = PF_DUMPCORE; if (atomic_read(&mm->mm_users) == nr + 1) goto done; /* @@ -340,6 +344,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (unlikely(p->mm == mm)) { 
lock_task_sighand(p, &flags); nr += zap_process(p, exit_code); + p->signal->flags = SIGNAL_GROUP_EXIT; unlock_task_sighand(p, &flags); } break; @@ -386,11 +391,18 @@ static int coredump_wait(int exit_code, struct core_state *core_state) return core_waiters; } -static void coredump_finish(struct mm_struct *mm) +static void coredump_finish(struct mm_struct *mm, bool core_dumped) { struct core_thread *curr, *next; struct task_struct *task; + spin_lock_irq(&current->sighand->siglock); + if (core_dumped && !__fatal_signal_pending(current)) + current->signal->group_exit_code |= 0x80; + current->signal->group_exit_task = NULL; + current->signal->flags = SIGNAL_GROUP_EXIT; + spin_unlock_irq(&current->sighand->siglock); + next = mm->core_state->dumper.next; while ((curr = next) != NULL) { next = curr->next; @@ -407,26 +419,38 @@ static void coredump_finish(struct mm_struct *mm) mm->core_state = NULL; } -static void wait_for_dump_helpers(struct file *file) +static bool dump_interrupted(void) { - struct pipe_inode_info *pipe; + /* + * SIGKILL or freezing() interrupt the coredumping. Perhaps we + * can do try_to_freeze() and check __fatal_signal_pending(), + * but then we need to teach dump_write() to restart and clear + * TIF_SIGPENDING. + */ + return signal_pending(current); +} - pipe = file_inode(file)->i_pipe; +static void wait_for_dump_helpers(struct file *file) +{ + struct pipe_inode_info *pipe = file->private_data; pipe_lock(pipe); pipe->readers++; pipe->writers--; + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + pipe_unlock(pipe); - while ((pipe->readers > 1) && (!signal_pending(current))) { - wake_up_interruptible_sync(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - pipe_wait(pipe); - } + /* + * We actually want wait_event_freezable() but then we need + * to clear TIF_SIGPENDING and improve dump_interrupted().
+ */ + wait_event_interruptible(pipe->wait, pipe->readers == 1); + pipe_lock(pipe); pipe->readers--; pipe->writers++; pipe_unlock(pipe); - } /* @@ -471,6 +495,7 @@ void do_coredump(siginfo_t *siginfo) int ispipe; struct files_struct *displaced; bool need_nonrelative = false; + bool core_dumped = false; static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .siginfo = siginfo, @@ -514,17 +539,12 @@ void do_coredump(siginfo_t *siginfo) old_cred = override_creds(cred); - /* - * Clear any false indication of pending signals that might - * be seen by the filesystem code called to write the core file. - */ - clear_thread_flag(TIF_SIGPENDING); - ispipe = format_corename(&cn, &cprm); - if (ispipe) { + if (ispipe) { int dump_count; char **helper_argv; + struct subprocess_info *sub_info; if (ispipe < 0) { printk(KERN_WARNING "format_corename failed\n"); @@ -571,15 +591,20 @@ void do_coredump(siginfo_t *siginfo) goto fail_dropcount; } - retval = call_usermodehelper_fns(helper_argv[0], helper_argv, - NULL, UMH_WAIT_EXEC, umh_pipe_setup, - NULL, &cprm); + retval = -ENOMEM; + sub_info = call_usermodehelper_setup(helper_argv[0], + helper_argv, NULL, GFP_KERNEL, + umh_pipe_setup, NULL, &cprm); + if (sub_info) + retval = call_usermodehelper_exec(sub_info, + UMH_WAIT_EXEC); + argv_free(helper_argv); if (retval) { - printk(KERN_INFO "Core dump to %s pipe failed\n", + printk(KERN_INFO "Core dump to %s pipe failed\n", cn.corename); goto close_fail; - } + } } else { struct inode *inode; @@ -629,10 +654,11 @@ void do_coredump(siginfo_t *siginfo) goto close_fail; if (displaced) put_files_struct(displaced); - retval = binfmt->core_dump(&cprm); - if (retval) - current->signal->group_exit_code |= 0x80; - + if (!dump_interrupted()) { + file_start_write(cprm.file); + core_dumped = binfmt->core_dump(&cprm); + file_end_write(cprm.file); + } if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); close_fail: @@ -644,7 +670,7 @@ fail_dropcount: fail_unlock: 
kfree(cn.corename); fail_corename: - coredump_finish(mm); + coredump_finish(mm, core_dumped); revert_creds(old_cred); fail_creds: put_cred(cred); @@ -659,7 +685,9 @@ fail: */ int dump_write(struct file *file, const void *addr, int nr) { - return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; + return !dump_interrupted() && + access_ok(VERIFY_READ, addr, nr) && + file->f_op->write(file, addr, nr, &file->f_pos) == nr; } EXPORT_SYMBOL(dump_write); @@ -668,7 +696,8 @@ int dump_seek(struct file *file, loff_t off) int ret = 1; if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - if (file->f_op->llseek(file, off, SEEK_CUR) < 0) + if (dump_interrupted() || + file->f_op->llseek(file, off, SEEK_CUR) < 0) return 0; } else { char *buf = (char *)get_zeroed_page(GFP_KERNEL); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 3ceb9ec976e1..35b1c7bd18b7 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -573,6 +573,7 @@ static struct file_system_type cramfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("cramfs"); static int __init init_cramfs_fs(void) { diff --git a/fs/dcache.c b/fs/dcache.c index fbfae008ba44..f09b9085f7d8 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -337,23 +337,6 @@ static void dentry_lru_del(struct dentry *dentry) } } -/* - * Remove a dentry that is unreferenced and about to be pruned - * (unhashed and destroyed) from the LRU, and inform the file system. - * This wrapper should be called _prior_ to unhashing a victim dentry. 
- */ -static void dentry_lru_prune(struct dentry *dentry) -{ - if (!list_empty(&dentry->d_lru)) { - if (dentry->d_flags & DCACHE_OP_PRUNE) - dentry->d_op->d_prune(dentry); - - spin_lock(&dcache_lru_lock); - __dentry_lru_del(dentry); - spin_unlock(&dcache_lru_lock); - } -} - static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list) { spin_lock(&dcache_lru_lock); @@ -486,11 +469,13 @@ relock: if (ref) dentry->d_count--; /* - * if dentry was on the d_lru list delete it from there. * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ - dentry_lru_prune(dentry); + if (dentry->d_flags & DCACHE_OP_PRUNE) + dentry->d_op->d_prune(dentry); + + dentry_lru_del(dentry); /* if it was on the hash then remove it */ __d_drop(dentry); return d_kill(dentry, parent); @@ -919,11 +904,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) struct inode *inode; /* - * remove the dentry from the lru, and inform - * the fs that this dentry is about to be + * inform the fs that this dentry is about to be * unhashed and destroyed. 
*/ - dentry_lru_prune(dentry); + if (dentry->d_flags & DCACHE_OP_PRUNE) + dentry->d_op->d_prune(dentry); + + dentry_lru_del(dentry); __d_shrink(dentry); if (dentry->d_count != 0) { @@ -1230,8 +1217,10 @@ void shrink_dcache_parent(struct dentry * parent) LIST_HEAD(dispose); int found; - while ((found = select_parent(parent, &dispose)) != 0) + while ((found = select_parent(parent, &dispose)) != 0) { shrink_dentry_list(&dispose); + cond_resched(); + } } EXPORT_SYMBOL(shrink_dcache_parent); @@ -2408,8 +2397,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) dentry->d_parent = dentry; list_del_init(&dentry->d_u.d_child); anon->d_parent = dparent; - list_del(&anon->d_u.d_child); - list_add(&anon->d_u.d_child, &dparent->d_subdirs); + list_move(&anon->d_u.d_child, &dparent->d_subdirs); write_seqcount_end(&dentry->d_seq); write_seqcount_end(&anon->d_seq); @@ -2542,7 +2530,6 @@ static int prepend_path(const struct path *path, bool slash = false; int error = 0; - br_read_lock(&vfsmount_lock); while (dentry != root->dentry || vfsmnt != root->mnt) { struct dentry * parent; @@ -2572,8 +2559,6 @@ static int prepend_path(const struct path *path, if (!error && !slash) error = prepend(buffer, buflen, "/", 1); -out: - br_read_unlock(&vfsmount_lock); return error; global_root: @@ -2590,7 +2575,7 @@ global_root: error = prepend(buffer, buflen, "/", 1); if (!error) error = is_mounted(vfsmnt) ? 
1 : 2; - goto out; + return error; } /** @@ -2617,9 +2602,11 @@ char *__d_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) return ERR_PTR(error); @@ -2636,9 +2623,11 @@ char *d_absolute_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = prepend_path(path, &root, &res, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error > 1) error = -EINVAL; @@ -2702,11 +2691,13 @@ char *d_path(const struct path *path, char *buf, int buflen) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); get_fs_root(current->fs, &root); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = path_with_deleted(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) res = ERR_PTR(error); - write_sequnlock(&rename_lock); path_put(&root); return res; } @@ -2830,6 +2821,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) get_fs_root_and_pwd(current->fs, &root, &pwd); error = -ENOENT; + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; @@ -2839,6 +2831,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &root, &cwd, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) goto out; @@ -2859,6 +2852,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } } else { write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); } out: diff --git a/fs/dcookies.c b/fs/dcookies.c index 17c779967828..ab5954b50267 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -25,6 +25,7 @@ #include 
<linux/dcookies.h> #include <linux/mutex.h> #include <linux/path.h> +#include <linux/compat.h> #include <asm/uaccess.h> /* The dcookies are allocated from a kmem_cache and @@ -145,7 +146,7 @@ out: /* And here is where the userspace process can look up the cookie value * to retrieve the path. */ -SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) +SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len) { unsigned long cookie = (unsigned long)cookie64; int err = -EINVAL; @@ -201,12 +202,16 @@ out: mutex_unlock(&dcookie_mutex); return err; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_lookup_dcookie(u64 cookie64, long buf, long len) + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len) { - return SYSC_lookup_dcookie(cookie64, (char __user *) buf, (size_t) len); +#ifdef __BIG_ENDIAN + return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); +#else + return sys_lookup_dcookie(((u64)w1 << 32) | w0, buf, len); +#endif } -SYSCALL_ALIAS(sys_lookup_dcookie, SyS_lookup_dcookie); #endif static int dcookie_init(void) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 0c4f80b447fb..4888cb3fdef7 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -299,6 +299,7 @@ static struct file_system_type debug_fs_type = { .mount = debug_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("debugfs"); static struct dentry *__create_file(const char *name, umode_t mode, struct dentry *parent, void *data, diff --git a/fs/direct-io.c b/fs/direct-io.c index f853263cf74f..7ab90f5081ee 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,6 +37,7 @@ #include <linux/uio.h> #include <linux/atomic.h> #include <linux/prefetch.h> +#include <linux/aio.h> /* * How many user pages to map in one call to get_user_pages(). 
This determines @@ -441,8 +442,8 @@ static struct bio *dio_await_one(struct dio *dio) static int dio_bio_complete(struct dio *dio, struct bio *bio) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec; - int page_no; + struct bio_vec *bvec; + unsigned i; if (!uptodate) dio->io_error = -EIO; @@ -450,8 +451,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) if (dio->is_async && dio->rw == READ) { bio_check_pages_dirty(bio); /* transfers ownership */ } else { - for (page_no = 0; page_no < bio->bi_vcnt; page_no++) { - struct page *page = bvec[page_no].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (dio->rw == READ && !PageCompound(page)) set_page_dirty_lock(page); @@ -672,12 +673,6 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, if (sdio->final_block_in_bio != sdio->cur_page_block || cur_offset != bio_next_offset) dio_bio_submit(dio, sdio); - /* - * Submit now if the underlying fs is about to perform a - * metadata read - */ - else if (sdio->boundary) - dio_bio_submit(dio, sdio); } if (sdio->bio == NULL) { @@ -737,16 +732,6 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, sdio->cur_page_block + (sdio->cur_page_len >> sdio->blkbits) == blocknr) { sdio->cur_page_len += len; - - /* - * If sdio->boundary then we want to schedule the IO now to - * avoid metadata seeks. 
- */ - if (sdio->boundary) { - ret = dio_send_cur_page(dio, sdio, map_bh); - page_cache_release(sdio->cur_page); - sdio->cur_page = NULL; - } goto out; } @@ -758,7 +743,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, page_cache_release(sdio->cur_page); sdio->cur_page = NULL; if (ret) - goto out; + return ret; } page_cache_get(page); /* It is in dio */ @@ -768,6 +753,16 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, sdio->cur_page_block = blocknr; sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; out: + /* + * If sdio->boundary then we want to schedule the IO now to + * avoid metadata seeks. + */ + if (sdio->boundary) { + ret = dio_send_cur_page(dio, sdio, map_bh); + dio_bio_submit(dio, sdio); + page_cache_release(sdio->cur_page); + sdio->cur_page = NULL; + } return ret; } @@ -969,7 +964,8 @@ do_holes: this_chunk_bytes = this_chunk_blocks << blkbits; BUG_ON(this_chunk_bytes == 0); - sdio->boundary = buffer_boundary(map_bh); + if (this_chunk_blocks == sdio->blocks_available) + sdio->boundary = buffer_boundary(map_bh); ret = submit_page_section(dio, sdio, page, offset_in_page, this_chunk_bytes, diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 4f5ad246582f..d0ccd2fd79eb 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -52,8 +52,8 @@ #include <linux/mutex.h> #include <linux/sctp.h> #include <linux/slab.h> +#include <linux/sctp.h> #include <net/sctp/sctp.h> -#include <net/sctp/user.h> #include <net/ipv6.h> #include "dlm_internal.h" diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 01fd5c11a7fb..f704458ea5f5 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -247,6 +247,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, struct dlm_ls *ls; struct plock_op *op; int rv; + unsigned char fl_flags = fl->fl_flags; ls = dlm_find_lockspace_local(lockspace); if (!ls) @@ -258,9 +259,18 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 
number, struct file *file, goto out; } - if (posix_lock_file_wait(file, fl) < 0) - log_error(ls, "dlm_posix_unlock: vfs unlock error %llx", - (unsigned long long)number); + /* cause the vfs unlock to return ENOENT if lock is not found */ + fl->fl_flags |= FL_EXISTS; + + rv = posix_lock_file_wait(file, fl); + if (rv == -ENOENT) { + rv = 0; + goto out_free; + } + if (rv < 0) { + log_error(ls, "dlm_posix_unlock: vfs unlock error %d %llx", + rv, (unsigned long long)number); + } op->info.optype = DLM_PLOCK_OP_UNLOCK; op->info.pid = fl->fl_pid; @@ -296,9 +306,11 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, if (rv == -ENOENT) rv = 0; +out_free: kfree(op); out: dlm_put_lockspace(ls); + fl->fl_flags = fl_flags; return rv; } EXPORT_SYMBOL_GPL(dlm_posix_unlock); diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index e15ef38c24fa..434aa313f077 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -12,3 +12,11 @@ config ECRYPT_FS To compile this file system support as a module, choose M here: the module will be called ecryptfs. + +config ECRYPT_FS_MESSAGING + bool "Enable notifications for userspace key wrap/unwrap" + depends on ECRYPT_FS + help + Enables the /dev/ecryptfs entry for use by ecryptfsd. This allows + for userspace to wrap/unwrap file encryption keys by other + backends, like OpenSSL. 
diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile index 2cc9ee4ad2eb..49678a69947d 100644 --- a/fs/ecryptfs/Makefile +++ b/fs/ecryptfs/Makefile @@ -1,7 +1,10 @@ # -# Makefile for the Linux 2.6 eCryptfs +# Makefile for the Linux eCryptfs # obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o -ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o miscdev.o kthread.o debug.o +ecryptfs-y := dentry.o file.o inode.o main.o super.o mmap.o read_write.o \ + crypto.o keystore.o kthread.o debug.o + +ecryptfs-$(CONFIG_ECRYPT_FS_MESSAGING) += messaging.o miscdev.o diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index a7b0c2dfb3db..d5c25db4398f 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -301,17 +301,14 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, while (size > 0 && i < sg_size) { pg = virt_to_page(addr); offset = offset_in_page(addr); - if (sg) - sg_set_page(&sg[i], pg, 0, offset); + sg_set_page(&sg[i], pg, 0, offset); remainder_of_page = PAGE_CACHE_SIZE - offset; if (size >= remainder_of_page) { - if (sg) - sg[i].length = remainder_of_page; + sg[i].length = remainder_of_page; addr += remainder_of_page; size -= remainder_of_page; } else { - if (sg) - sg[i].length = size; + sg[i].length = size; addr += size; size = 0; } diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 1b5d9af937df..bf12ba5dd223 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -45,14 +45,12 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *lower_dentry; - struct vfsmount *lower_mnt; int rc = 1; if (flags & LOOKUP_RCU) return -ECHILD; lower_dentry = ecryptfs_dentry_to_lower(dentry); - lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) goto out; rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); diff --git a/fs/ecryptfs/ecryptfs_kernel.h 
b/fs/ecryptfs/ecryptfs_kernel.h index 7e2c6f5d7985..dd299b389d4e 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -172,6 +172,19 @@ ecryptfs_get_key_payload_data(struct key *key) #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24 #define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32) +#ifdef CONFIG_ECRYPT_FS_MESSAGING +# define ECRYPTFS_VERSIONING_MASK_MESSAGING (ECRYPTFS_VERSIONING_DEVMISC \ + | ECRYPTFS_VERSIONING_PUBKEY) +#else +# define ECRYPTFS_VERSIONING_MASK_MESSAGING 0 +#endif + +#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ + | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ + | ECRYPTFS_VERSIONING_XATTR \ + | ECRYPTFS_VERSIONING_MULTKEY \ + | ECRYPTFS_VERSIONING_MASK_MESSAGING \ + | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) struct ecryptfs_key_sig { struct list_head crypt_stat_list; char keysig[ECRYPTFS_SIG_SIZE_HEX + 1]; @@ -399,7 +412,9 @@ struct ecryptfs_daemon { struct hlist_node euid_chain; }; +#ifdef CONFIG_ECRYPT_FS_MESSAGING extern struct mutex ecryptfs_daemon_hash_mux; +#endif static inline size_t ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat) @@ -610,6 +625,7 @@ int ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); +#ifdef CONFIG_ECRYPT_FS_MESSAGING int ecryptfs_process_response(struct ecryptfs_daemon *daemon, struct ecryptfs_message *msg, u32 seq); int ecryptfs_send_message(char *data, int data_len, @@ -618,6 +634,24 @@ int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, struct ecryptfs_message **emsg); int ecryptfs_init_messaging(void); void ecryptfs_release_messaging(void); +#else +static inline int ecryptfs_init_messaging(void) +{ + return 0; +} +static inline void ecryptfs_release_messaging(void) +{ } +static inline int ecryptfs_send_message(char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx) +{ + return 
-ENOTCONN; +} +static inline int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, + struct ecryptfs_message **emsg) +{ + return -ENOMSG; +} +#endif void ecryptfs_write_header_metadata(char *virt, @@ -655,12 +689,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, size_t offset_in_page, size_t size, struct inode *ecryptfs_inode); struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); -int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); -int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon); int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, size_t *length_size); int ecryptfs_write_packet_length(char *dest, size_t size, size_t *packet_size_length); +#ifdef CONFIG_ECRYPT_FS_MESSAGING int ecryptfs_init_ecryptfs_miscdev(void); void ecryptfs_destroy_ecryptfs_miscdev(void); int ecryptfs_send_miscdev(char *data, size_t data_size, @@ -669,6 +702,9 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); int ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file); +int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon); +#endif int ecryptfs_init_kthread(void); void ecryptfs_destroy_kthread(void); int ecryptfs_privileged_open(struct file **lower_file, diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 53acc9d0c138..201f0a0d6b0a 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -31,6 +31,7 @@ #include <linux/security.h> #include <linux/compat.h> #include <linux/fs_stack.h> +#include <linux/aio.h> #include "ecryptfs_kernel.h" /** @@ -199,7 +200,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) struct dentry *ecryptfs_dentry = file->f_path.dentry; /* Private value of ecryptfs_dentry allocated in * ecryptfs_lookup() */ - struct dentry *lower_dentry; struct ecryptfs_file_info *file_info; 
mount_crypt_stat = &ecryptfs_superblock_to_private( @@ -222,7 +222,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) rc = -ENOMEM; goto out; } - lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; mutex_lock(&crypt_stat->cs_mutex); if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index e0f07fb6d56b..5eab400e2590 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -999,8 +999,8 @@ out: return rc; } -int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) { struct ecryptfs_mount_crypt_stat *mount_crypt_stat; int rc = 0; @@ -1021,8 +1021,8 @@ int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, return rc; } -int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) { struct kstat lower_stat; int rc; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 2333203a120b..7d52806c2119 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1150,7 +1150,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_message *msg = NULL; char *auth_tok_sig; char *payload; - size_t payload_len; + size_t payload_len = 0; int rc; rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok); @@ -1168,7 +1168,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); if (rc) { ecryptfs_printk(KERN_ERR, "Error sending message to " - "ecryptfsd\n"); + "ecryptfsd: %d\n", rc); goto out; } rc = ecryptfs_wait_for_response(msg_ctx, &msg); @@ -1202,8 +1202,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, 
crypt_stat->key_size); } out: - if (msg) - kfree(msg); + kfree(msg); return rc; } @@ -1989,7 +1988,7 @@ pki_encrypt_session_key(struct key *auth_tok_key, rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); if (rc) { ecryptfs_printk(KERN_ERR, "Error sending message to " - "ecryptfsd\n"); + "ecryptfsd: %d\n", rc); goto out; } rc = ecryptfs_wait_for_response(msg_ctx, &msg); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 4e0886c9e5c4..e924cf45aad9 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -629,6 +629,7 @@ static struct file_system_type ecryptfs_fs_type = { .kill_sb = ecryptfs_kill_block_super, .fs_flags = 0 }; +MODULE_ALIAS_FS("ecryptfs"); /** * inode_info_init_once diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 8d7a577ae497..49ff8ea08f1c 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -97,8 +97,7 @@ static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx) void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) { list_move(&(msg_ctx->node), &ecryptfs_msg_ctx_free_list); - if (msg_ctx->msg) - kfree(msg_ctx->msg); + kfree(msg_ctx->msg); msg_ctx->msg = NULL; msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_FREE; } @@ -283,7 +282,7 @@ ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, int rc; rc = ecryptfs_find_daemon_by_euid(&daemon); - if (rc || !daemon) { + if (rc) { rc = -ENOTCONN; goto out; } diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 412e6eda25f8..e4141f257495 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -80,13 +80,6 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file) int rc; mutex_lock(&ecryptfs_daemon_hash_mux); - rc = try_module_get(THIS_MODULE); - if (rc == 0) { - rc = -EIO; - printk(KERN_ERR "%s: Error attempting to increment module use " - "count; rc = [%d]\n", __func__, rc); - goto out_unlock_daemon_list; - } rc = ecryptfs_find_daemon_by_euid(&daemon); if (!rc) { rc = -EINVAL; @@ 
-96,7 +89,7 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file) if (rc) { printk(KERN_ERR "%s: Error attempting to spawn daemon; " "rc = [%d]\n", __func__, rc); - goto out_module_put_unlock_daemon_list; + goto out_unlock_daemon_list; } mutex_lock(&daemon->mux); if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) { @@ -108,9 +101,6 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file) atomic_inc(&ecryptfs_num_miscdev_opens); out_unlock_daemon: mutex_unlock(&daemon->mux); -out_module_put_unlock_daemon_list: - if (rc) - module_put(THIS_MODULE); out_unlock_daemon_list: mutex_unlock(&ecryptfs_daemon_hash_mux); return rc; @@ -147,7 +137,6 @@ ecryptfs_miscdev_release(struct inode *inode, struct file *file) "bug.\n", __func__, rc); BUG(); } - module_put(THIS_MODULE); return rc; } @@ -471,6 +460,7 @@ out_free: static const struct file_operations ecryptfs_miscdev_fops = { + .owner = THIS_MODULE, .open = ecryptfs_miscdev_open, .poll = ecryptfs_miscdev_poll, .read = ecryptfs_miscdev_read, diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig new file mode 100644 index 000000000000..367bbb10c543 --- /dev/null +++ b/fs/efivarfs/Kconfig @@ -0,0 +1,12 @@ +config EFIVAR_FS + tristate "EFI Variable filesystem" + depends on EFI + help + efivarfs is a replacement filesystem for the old EFI + variable support via sysfs, as it doesn't suffer from the + same 1024-byte variable size limit. + + To compile this file system support as a module, choose M + here. The module will be called efivarfs. + + If unsure, say N. 
diff --git a/fs/efivarfs/Makefile b/fs/efivarfs/Makefile new file mode 100644 index 000000000000..955d478177d5 --- /dev/null +++ b/fs/efivarfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the efivarfs filesystem +# + +obj-$(CONFIG_EFIVAR_FS) += efivarfs.o + +efivarfs-objs := inode.o file.o super.o diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c new file mode 100644 index 000000000000..bfb531564319 --- /dev/null +++ b/fs/efivarfs/file.c @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/efi.h> +#include <linux/fs.h> +#include <linux/slab.h> + +#include "internal.h" + +static ssize_t efivarfs_file_write(struct file *file, + const char __user *userbuf, size_t count, loff_t *ppos) +{ + struct efivar_entry *var = file->private_data; + void *data; + u32 attributes; + struct inode *inode = file->f_mapping->host; + unsigned long datasize = count - sizeof(attributes); + ssize_t bytes = 0; + bool set = false; + + if (count < sizeof(attributes)) + return -EINVAL; + + if (copy_from_user(&attributes, userbuf, sizeof(attributes))) + return -EFAULT; + + if (attributes & ~(EFI_VARIABLE_MASK)) + return -EINVAL; + + data = kmalloc(datasize, GFP_KERNEL); + if (!data) + return -ENOMEM; + + if (copy_from_user(data, userbuf + sizeof(attributes), datasize)) { + bytes = -EFAULT; + goto out; + } + + bytes = efivar_entry_set_get_size(var, attributes, &datasize, + data, &set); + if (!set && bytes) + goto out; + + if (bytes == -ENOENT) { + drop_nlink(inode); + d_delete(file->f_dentry); + dput(file->f_dentry); + } else { + mutex_lock(&inode->i_mutex); + i_size_write(inode, datasize + sizeof(attributes)); + mutex_unlock(&inode->i_mutex); + } + + bytes = count; + +out: + kfree(data); + + return bytes; +} + 
+static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct efivar_entry *var = file->private_data; + unsigned long datasize = 0; + u32 attributes; + void *data; + ssize_t size = 0; + int err; + + err = efivar_entry_size(var, &datasize); + if (err) + return err; + + data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + size = efivar_entry_get(var, &attributes, &datasize, + data + sizeof(attributes)); + if (size) + goto out_free; + + memcpy(data, &attributes, sizeof(attributes)); + size = simple_read_from_buffer(userbuf, count, ppos, + data, datasize + sizeof(attributes)); +out_free: + kfree(data); + + return size; +} + +const struct file_operations efivarfs_file_operations = { + .open = simple_open, + .read = efivarfs_file_read, + .write = efivarfs_file_write, + .llseek = no_llseek, +}; diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c new file mode 100644 index 000000000000..7e787fb90293 --- /dev/null +++ b/fs/efivarfs/inode.c @@ -0,0 +1,174 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/efi.h> +#include <linux/fs.h> +#include <linux/ctype.h> +#include <linux/slab.h> + +#include "internal.h" + +struct inode *efivarfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev) +{ + struct inode *inode = new_inode(sb); + + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &efivarfs_file_operations; + break; + case S_IFDIR: + inode->i_op = &efivarfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inc_nlink(inode); + break; + } + } + return inode; +} + +/* + * Return true if 'str' is a valid efivarfs filename of the form, + * + * VariableName-12345678-1234-1234-1234-1234567891bc + */ +bool efivarfs_valid_name(const char *str, int len) +{ + static const char dashes[EFI_VARIABLE_GUID_LEN] = { + [8] = 1, [13] = 1, [18] = 1, [23] = 1 + }; + const char *s = str + len - EFI_VARIABLE_GUID_LEN; + int i; + + /* + * We need a GUID, plus at least one letter for the variable name, + * plus the '-' separator + */ + if (len < EFI_VARIABLE_GUID_LEN + 2) + return false; + + /* GUID must be preceded by a '-' */ + if (*(s - 1) != '-') + return false; + + /* + * Validate that 's' is of the correct format, e.g. 
+ * + * 12345678-1234-1234-1234-123456789abc + */ + for (i = 0; i < EFI_VARIABLE_GUID_LEN; i++) { + if (dashes[i]) { + if (*s++ != '-') + return false; + } else { + if (!isxdigit(*s++)) + return false; + } + } + + return true; +} + +static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid) +{ + guid->b[0] = hex_to_bin(str[6]) << 4 | hex_to_bin(str[7]); + guid->b[1] = hex_to_bin(str[4]) << 4 | hex_to_bin(str[5]); + guid->b[2] = hex_to_bin(str[2]) << 4 | hex_to_bin(str[3]); + guid->b[3] = hex_to_bin(str[0]) << 4 | hex_to_bin(str[1]); + guid->b[4] = hex_to_bin(str[11]) << 4 | hex_to_bin(str[12]); + guid->b[5] = hex_to_bin(str[9]) << 4 | hex_to_bin(str[10]); + guid->b[6] = hex_to_bin(str[16]) << 4 | hex_to_bin(str[17]); + guid->b[7] = hex_to_bin(str[14]) << 4 | hex_to_bin(str[15]); + guid->b[8] = hex_to_bin(str[19]) << 4 | hex_to_bin(str[20]); + guid->b[9] = hex_to_bin(str[21]) << 4 | hex_to_bin(str[22]); + guid->b[10] = hex_to_bin(str[24]) << 4 | hex_to_bin(str[25]); + guid->b[11] = hex_to_bin(str[26]) << 4 | hex_to_bin(str[27]); + guid->b[12] = hex_to_bin(str[28]) << 4 | hex_to_bin(str[29]); + guid->b[13] = hex_to_bin(str[30]) << 4 | hex_to_bin(str[31]); + guid->b[14] = hex_to_bin(str[32]) << 4 | hex_to_bin(str[33]); + guid->b[15] = hex_to_bin(str[34]) << 4 | hex_to_bin(str[35]); +} + +static int efivarfs_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) +{ + struct inode *inode; + struct efivar_entry *var; + int namelen, i = 0, err = 0; + + if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len)) + return -EINVAL; + + inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0); + if (!inode) + return -ENOMEM; + + var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL); + if (!var) { + err = -ENOMEM; + goto out; + } + + /* length of the variable name itself: remove GUID and separator */ + namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1; + + efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1, + &var->var.VendorGuid); + 
+ for (i = 0; i < namelen; i++) + var->var.VariableName[i] = dentry->d_name.name[i]; + + var->var.VariableName[i] = '\0'; + + inode->i_private = var; + + efivar_entry_add(var, &efivarfs_list); + d_instantiate(dentry, inode); + dget(dentry); +out: + if (err) { + kfree(var); + iput(inode); + } + return err; +} + +static int efivarfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct efivar_entry *var = dentry->d_inode->i_private; + + if (efivar_entry_delete(var)) + return -EINVAL; + + drop_nlink(dentry->d_inode); + dput(dentry); + return 0; +}; + +/* + * Handle negative dentry. + */ +static struct dentry *efivarfs_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_add(dentry, NULL); + return NULL; +} + +const struct inode_operations efivarfs_dir_inode_operations = { + .lookup = efivarfs_lookup, + .unlink = efivarfs_unlink, + .create = efivarfs_create, +}; diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h new file mode 100644 index 000000000000..b5ff16addb7c --- /dev/null +++ b/fs/efivarfs/internal.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#ifndef EFIVAR_FS_INTERNAL_H +#define EFIVAR_FS_INTERNAL_H + +#include <linux/list.h> + +extern const struct file_operations efivarfs_file_operations; +extern const struct inode_operations efivarfs_dir_inode_operations; +extern bool efivarfs_valid_name(const char *str, int len); +extern struct inode *efivarfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev); + +extern struct list_head efivarfs_list; + +#endif /* EFIVAR_FS_INTERNAL_H */ diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c new file mode 100644 index 000000000000..141aee31884f --- /dev/null +++ b/fs/efivarfs/super.c @@ -0,0 +1,270 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ctype.h> +#include <linux/efi.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/ucs2_string.h> +#include <linux/slab.h> +#include <linux/magic.h> + +#include "internal.h" + +LIST_HEAD(efivarfs_list); + +static void efivarfs_evict_inode(struct inode *inode) +{ + clear_inode(inode); +} + +static const struct super_operations efivarfs_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = efivarfs_evict_inode, + .show_options = generic_show_options, +}; + +static struct super_block *efivarfs_sb; + +/* + * Compare two efivarfs file names. + * + * An efivarfs filename is composed of two parts, + * + * 1. A case-sensitive variable name + * 2. A case-insensitive GUID + * + * So we need to perform a case-sensitive match on part 1 and a + * case-insensitive match on part 2. 
+ */ +static int efivarfs_d_compare(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, + const struct qstr *name) +{ + int guid = len - EFI_VARIABLE_GUID_LEN; + + if (name->len != len) + return 1; + + /* Case-sensitive compare for the variable name */ + if (memcmp(str, name->name, guid)) + return 1; + + /* Case-insensitive compare for the GUID */ + return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN); +} + +static int efivarfs_d_hash(const struct dentry *dentry, + const struct inode *inode, struct qstr *qstr) +{ + unsigned long hash = init_name_hash(); + const unsigned char *s = qstr->name; + unsigned int len = qstr->len; + + if (!efivarfs_valid_name(s, len)) + return -EINVAL; + + while (len-- > EFI_VARIABLE_GUID_LEN) + hash = partial_name_hash(*s++, hash); + + /* GUID is case-insensitive. */ + while (len--) + hash = partial_name_hash(tolower(*s++), hash); + + qstr->hash = end_name_hash(hash); + return 0; +} + +/* + * Retaining negative dentries for an in-memory filesystem just wastes + * memory and lookup time: arrange for them to be deleted immediately. 
+ */ +static int efivarfs_delete_dentry(const struct dentry *dentry) +{ + return 1; +} + +static struct dentry_operations efivarfs_d_ops = { + .d_compare = efivarfs_d_compare, + .d_hash = efivarfs_d_hash, + .d_delete = efivarfs_delete_dentry, +}; + +static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) +{ + struct dentry *d; + struct qstr q; + int err; + + q.name = name; + q.len = strlen(name); + + err = efivarfs_d_hash(NULL, NULL, &q); + if (err) + return ERR_PTR(err); + + d = d_alloc(parent, &q); + if (d) + return d; + + return ERR_PTR(-ENOMEM); +} + +static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, + unsigned long name_size, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct efivar_entry *entry; + struct inode *inode = NULL; + struct dentry *dentry, *root = sb->s_root; + unsigned long size = 0; + char *name; + int len, i; + int err = -ENOMEM; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return err; + + memcpy(entry->var.VariableName, name16, name_size); + memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t)); + + len = ucs2_strlen(entry->var.VariableName); + + /* name, plus '-', plus GUID, plus NUL*/ + name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL); + if (!name) + goto fail; + + for (i = 0; i < len; i++) + name[i] = entry->var.VariableName[i] & 0xFF; + + name[len] = '-'; + + efi_guid_unparse(&entry->var.VendorGuid, name + len + 1); + + name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; + + inode = efivarfs_get_inode(sb, root->d_inode, S_IFREG | 0644, 0); + if (!inode) + goto fail_name; + + dentry = efivarfs_alloc_dentry(root, name); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto fail_inode; + } + + /* copied by the above to local storage in the dentry. 
*/ + kfree(name); + + efivar_entry_size(entry, &size); + efivar_entry_add(entry, &efivarfs_list); + + mutex_lock(&inode->i_mutex); + inode->i_private = entry; + i_size_write(inode, size + sizeof(entry->var.Attributes)); + mutex_unlock(&inode->i_mutex); + d_add(dentry, inode); + + return 0; + +fail_inode: + iput(inode); +fail_name: + kfree(name); +fail: + kfree(entry); + return err; +} + +static int efivarfs_destroy(struct efivar_entry *entry, void *data) +{ + efivar_entry_remove(entry); + kfree(entry); + return 0; +} + +static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode = NULL; + struct dentry *root; + int err; + + efivarfs_sb = sb; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = EFIVARFS_MAGIC; + sb->s_op = &efivarfs_ops; + sb->s_d_op = &efivarfs_d_ops; + sb->s_time_gran = 1; + + inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0); + if (!inode) + return -ENOMEM; + inode->i_op = &efivarfs_dir_inode_operations; + + root = d_make_root(inode); + sb->s_root = root; + if (!root) + return -ENOMEM; + + INIT_LIST_HEAD(&efivarfs_list); + + err = efivar_init(efivarfs_callback, (void *)sb, false, + true, &efivarfs_list); + if (err) + __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); + + return err; +} + +static struct dentry *efivarfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_single(fs_type, flags, data, efivarfs_fill_super); +} + +static void efivarfs_kill_sb(struct super_block *sb) +{ + kill_litter_super(sb); + efivarfs_sb = NULL; + + /* Remove all entries and destroy */ + __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); +} + +static struct file_system_type efivarfs_type = { + .name = "efivarfs", + .mount = efivarfs_mount, + .kill_sb = efivarfs_kill_sb, +}; + +static __init int efivarfs_init(void) +{ + if 
(!efi_enabled(EFI_RUNTIME_SERVICES)) + return 0; + + if (!efivars_kobject()) + return 0; + + return register_filesystem(&efivarfs_type); +} + +MODULE_AUTHOR("Matthew Garrett, Jeremy Kerr"); +MODULE_DESCRIPTION("EFI Variable Filesystem"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_FS("efivarfs"); + +module_init(efivarfs_init); diff --git a/fs/efs/super.c b/fs/efs/super.c index 2002431ef9a0..c6f57a74a559 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -33,6 +33,7 @@ static struct file_system_type efs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("efs"); static struct pt_types sgi_pt_types[] = { {0x00, "SGI vh"}, diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9fec1836057a..deecc7294a67 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -40,6 +40,7 @@ #include <linux/atomic.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/compat.h> /* * LOCKING: @@ -104,7 +105,7 @@ struct epoll_filefd { struct file *file; int fd; -}; +} __packed; /* * Structure used to track possible nested calls, for too deep recursions @@ -128,6 +129,8 @@ struct nested_calls { /* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. + * Avoid increasing the size of this struct, there can be many thousands + * of these on a server and we do not want this to take another cache line. 
*/ struct epitem { /* RB tree node used to link this structure to the eventpoll RB tree */ @@ -158,7 +161,7 @@ struct epitem { struct list_head fllink; /* wakeup_source used when EPOLLWAKEUP is set */ - struct wakeup_source *ws; + struct wakeup_source __rcu *ws; /* The structure that describe the interested events and the source fd */ struct epoll_event event; @@ -536,6 +539,38 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) } } +/* call only when ep->mtx is held */ +static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) +{ + return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); +} + +/* call only when ep->mtx is held */ +static inline void ep_pm_stay_awake(struct epitem *epi) +{ + struct wakeup_source *ws = ep_wakeup_source(epi); + + if (ws) + __pm_stay_awake(ws); +} + +static inline bool ep_has_wakeup_source(struct epitem *epi) +{ + return rcu_access_pointer(epi->ws) ? true : false; +} + +/* call when ep->mtx cannot be held (ep_poll_callback) */ +static inline void ep_pm_stay_awake_rcu(struct epitem *epi) +{ + struct wakeup_source *ws; + + rcu_read_lock(); + ws = rcu_dereference(epi->ws); + if (ws) + __pm_stay_awake(ws); + rcu_read_unlock(); +} + /** * ep_scan_ready_list - Scans the ready list in a way that makes possible for * the scan code, to call f_op->poll(). 
Also allows for @@ -599,7 +634,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, */ if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); } } /* @@ -668,7 +703,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); - wakeup_source_unregister(epi->ws); + wakeup_source_unregister(ep_wakeup_source(epi)); /* At this point it is safe to free the eventpoll item */ kmem_cache_free(epi_cache, epi); @@ -711,11 +746,15 @@ static void ep_free(struct eventpoll *ep) * point we are sure no poll callbacks will be lingering around, and also by * holding "epmutex" we can be sure that no file cleanup code will hit * us during this operation. So we can avoid the lock on "ep->lock". + * We do not need to lock ep->mtx, either, we only do it to prevent + * a lockdep warning. */ + mutex_lock(&ep->mtx); while ((rbp = rb_first(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); } + mutex_unlock(&ep->mtx); mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); @@ -734,6 +773,13 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } +static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt) +{ + pt->_key = epi->event.events; + + return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events; +} + static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) { @@ -741,10 +787,9 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, poll_table pt; init_poll_funcptr(&pt, NULL); + list_for_each_entry_safe(epi, tmp, head, rdllink) { - pt._key = epi->event.events; - if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & - epi->event.events) + if (ep_item_poll(epi, &pt)) return POLLIN | POLLRDNORM; else { /* @@ -752,7 +797,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head 
*head, * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ - __pm_relax(epi->ws); + __pm_relax(ep_wakeup_source(epi)); list_del_init(&epi->rdllink); } } @@ -984,7 +1029,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k /* If this file is already in the ready list we exit soon */ if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake_rcu(epi); } /* @@ -1146,6 +1191,7 @@ static int reverse_path_check(void) static int ep_create_wakeup_source(struct epitem *epi) { const char *name; + struct wakeup_source *ws; if (!epi->ep->ws) { epi->ep->ws = wakeup_source_register("eventpoll"); @@ -1154,17 +1200,29 @@ static int ep_create_wakeup_source(struct epitem *epi) } name = epi->ffd.file->f_path.dentry->d_name.name; - epi->ws = wakeup_source_register(name); - if (!epi->ws) + ws = wakeup_source_register(name); + + if (!ws) return -ENOMEM; + rcu_assign_pointer(epi->ws, ws); return 0; } -static void ep_destroy_wakeup_source(struct epitem *epi) +/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ +static noinline void ep_destroy_wakeup_source(struct epitem *epi) { - wakeup_source_unregister(epi->ws); - epi->ws = NULL; + struct wakeup_source *ws = ep_wakeup_source(epi); + + RCU_INIT_POINTER(epi->ws, NULL); + + /* + * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is + * used internally by wakeup_source_remove, too (called by + * wakeup_source_unregister), so we cannot use call_rcu + */ + synchronize_rcu(); + wakeup_source_unregister(ws); } /* @@ -1199,13 +1257,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, if (error) goto error_create_wakeup_source; } else { - epi->ws = NULL; + RCU_INIT_POINTER(epi->ws, NULL); } /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); - epq.pt._key = 
event->events; /* * Attach the item to the poll hooks and get current event bits. @@ -1214,7 +1271,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * this operation completes, the poll callback can start hitting * the new item. */ - revents = tfile->f_op->poll(tfile, &epq.pt); + revents = ep_item_poll(epi, &epq.pt); /* * We have to check if something went wrong during the poll wait queue @@ -1247,7 +1304,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* If the file is already "ready" we drop it inside the ready list */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) @@ -1288,7 +1345,7 @@ error_unregister: list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); - wakeup_source_unregister(epi->ws); + wakeup_source_unregister(ep_wakeup_source(epi)); error_create_wakeup_source: kmem_cache_free(epi_cache, epi); @@ -1314,12 +1371,11 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; /* need barrier below */ - pt._key = event->events; epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { - if (!epi->ws) + if (!ep_has_wakeup_source(epi)) ep_create_wakeup_source(epi); - } else if (epi->ws) { + } else if (ep_has_wakeup_source(epi)) { ep_destroy_wakeup_source(epi); } @@ -1347,7 +1403,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. 
*/ - revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt); + revents = ep_item_poll(epi, &pt); /* * If the item is "hot" and it is not registered inside the ready @@ -1357,7 +1413,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even spin_lock_irq(&ep->lock); if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) @@ -1383,6 +1439,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, unsigned int revents; struct epitem *epi; struct epoll_event __user *uevent; + struct wakeup_source *ws; poll_table pt; init_poll_funcptr(&pt, NULL); @@ -1405,14 +1462,16 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, * instead, but then epi->ws would temporarily be out of sync * with ep_is_linked(). */ - if (epi->ws && epi->ws->active) - __pm_stay_awake(ep->ws); - __pm_relax(epi->ws); + ws = ep_wakeup_source(epi); + if (ws) { + if (ws->active) + __pm_stay_awake(ep->ws); + __pm_relax(ws); + } + list_del_init(&epi->rdllink); - pt._key = epi->event.events; - revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) & - epi->event.events; + revents = ep_item_poll(epi, &pt); /* * If the event mask intersect the caller-requested one, @@ -1424,7 +1483,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, if (__put_user(revents, &uevent->events) || __put_user(epi->event.data, &uevent->data)) { list_add(&epi->rdllink, head); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); return eventcnt ? eventcnt : -EFAULT; } eventcnt++; @@ -1444,7 +1503,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, * poll callback will queue them in ep->ovflist. 
*/ list_add_tail(&epi->rdllink, &ep->rdllist); - __pm_stay_awake(epi->ws); + ep_pm_stay_awake(epi); } } } @@ -1940,6 +1999,52 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, return error; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, + struct epoll_event __user *, events, + int, maxevents, int, timeout, + const compat_sigset_t __user *, sigmask, + compat_size_t, sigsetsize) +{ + long err; + compat_sigset_t csigmask; + sigset_t ksigmask, sigsaved; + + /* + * If the caller wants a certain signal mask to be set during the wait, + * we apply it here. + */ + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) + return -EFAULT; + sigset_from_compat(&ksigmask, &csigmask); + sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + err = sys_epoll_wait(epfd, events, maxevents, timeout); + + /* + * If we changed the signal mask, we need to restore the original one. + * In case we've got a signal while waiting, we do not restore the + * signal mask yet, and we allow do_signal() to deliver the signal on + * the way back to userspace, before the signal mask is restored. 
+ */ + if (sigmask) { + if (err == -EINTR) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } else + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + } + + return err; +} +#endif + static int __init eventpoll_init(void) { struct sysinfo si; @@ -1964,6 +2069,12 @@ static int __init eventpoll_init(void) /* Initialize the structure used to perform file's f_op->poll() calls */ ep_nested_calls_init(&poll_readywalk_ncalls); + /* + * We can have many thousands of epitems, so prevent this from + * using an extra cache line on 64-bit (and smaller) CPUs + */ + BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); + /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); diff --git a/fs/exec.c b/fs/exec.c index a96a4885bbbf..643019585574 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -613,7 +613,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : 0); + vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch @@ -622,7 +622,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : 0); + vma->vm_next ? 
vma->vm_next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb, new_end, old_end); @@ -802,6 +802,15 @@ int kernel_read(struct file *file, loff_t offset, EXPORT_SYMBOL(kernel_read); +ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) +{ + ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos); + if (res > 0) + flush_icache_range(addr, addr + len); + return res; +} +EXPORT_SYMBOL(read_code); + static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; @@ -898,11 +907,13 @@ static int de_thread(struct task_struct *tsk) sig->notify_count = -1; /* for exit_notify() */ for (;;) { + threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); + threadgroup_change_end(tsk); schedule(); if (unlikely(__fatal_signal_pending(tsk))) goto killed; @@ -960,6 +971,7 @@ static int de_thread(struct task_struct *tsk) if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); + threadgroup_change_end(tsk); release_task(leader); } @@ -1027,17 +1039,7 @@ EXPORT_SYMBOL_GPL(get_task_comm); void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); - trace_task_rename(tsk, buf); - - /* - * Threads may access current->comm without holding - * the task lock, so write the string carefully. - * Readers without a lock may see incomplete new - * names but are safe from non-terminating string reads. 
- */ - memset(tsk->comm, 0, TASK_COMM_LEN); - wmb(); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk); diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index f936cb50dc0d..b74422888604 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -401,7 +401,7 @@ static void _clear_bio(struct bio *bio) struct bio_vec *bv; unsigned i; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { unsigned this_count = bv->bv_len; if (likely(PAGE_SIZE == this_count)) diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index b963f38ac298..7682b970d0f1 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -432,7 +432,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) if (!bio) continue; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; SetPageUptodate(page); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 5e59280d42d7..9d9763328734 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -1010,6 +1010,7 @@ static struct file_system_type exofs_type = { .mount = exofs_mount, .kill_sb = generic_shutdown_super, }; +MODULE_ALIAS_FS("exofs"); static int __init init_exofs(void) { diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 8f370e012e61..7cadd823bb31 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -118,7 +118,6 @@ void ext2_free_inode (struct inode * inode) * as writing the quota to disk may need the lock as well. 
*/ /* Quota is already initialized in iput() */ - ext2_xattr_delete_inode(inode); dquot_free_inode(inode); dquot_drop(inode); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c3881e56662e..0a87bb10998d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -31,9 +31,11 @@ #include <linux/mpage.h> #include <linux/fiemap.h> #include <linux/namei.h> +#include <linux/aio.h> #include "ext2.h" #include "acl.h" #include "xip.h" +#include "xattr.h" static int __ext2_write_inode(struct inode *inode, int do_sync); @@ -88,6 +90,7 @@ void ext2_evict_inode(struct inode * inode) inode->i_size = 0; if (inode->i_blocks) ext2_truncate_blocks(inode, 0); + ext2_xattr_delete_inode(inode); } invalidate_inode_buffers(inode); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 7f68c8114026..288534920fe5 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1536,6 +1536,7 @@ static struct file_system_type ext2_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext2"); static int __init init_ext2_fs(void) { diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index d512c4bc4ad7..23c712825640 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -27,6 +27,7 @@ #include <linux/writeback.h> #include <linux/mpage.h> #include <linux/namei.h> +#include <linux/aio.h> #include "ext3.h" #include "xattr.h" #include "acl.h" @@ -218,7 +219,8 @@ void ext3_evict_inode (struct inode *inode) */ if (inode->i_nlink && ext3_should_journal_data(inode) && EXT3_SB(inode->i_sb)->s_journal && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT3_JOURNAL_INO) { tid_t commit_tid = atomic_read(&ei->i_datasync_tid); journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 5546ca225ffe..6356665a74bb 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -353,7 +353,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) 
return bdev; fail: - ext3_msg(sb, "error: failed to open journal device %s: %ld", + ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld", __bdevname(dev, b), PTR_ERR(bdev)); return NULL; @@ -362,22 +362,19 @@ fail: /* * Release the journal device */ -static int ext3_blkdev_put(struct block_device *bdev) +static void ext3_blkdev_put(struct block_device *bdev) { - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } -static int ext3_blkdev_remove(struct ext3_sb_info *sbi) +static void ext3_blkdev_remove(struct ext3_sb_info *sbi) { struct block_device *bdev; - int ret = -ENODEV; - bdev = sbi->journal_bdev; if (bdev) { - ret = ext3_blkdev_put(bdev); + ext3_blkdev_put(bdev); sbi->journal_bdev = NULL; } - return ret; } static inline struct inode *orphan_list_entry(struct list_head *l) @@ -887,7 +884,7 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) /*todo: use simple_strtoll with >32bit ext3 */ sb_block = simple_strtoul(options, &options, 0); if (*options && *options != ',') { - ext3_msg(sb, "error: invalid sb specification: %s", + ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s", (char *) *data); return 1; } @@ -2067,7 +2064,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": "writeback"); - sb->s_flags |= MS_SNAP_STABLE; return 0; @@ -3068,6 +3064,7 @@ static struct file_system_type ext3_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext3"); static int __init init_ext3_fs(void) { diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 987358740cb9..efea5d5c44ce 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -71,4 +71,5 @@ config EXT4_DEBUG Enables run-time debugging support for the ext4 filesystem. 
If you select Y here, then you will be able to turn on debugging - with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" + with a command such as: + echo 1 > /sys/module/ext4/parameters/mballoc_debug diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 92e68b33fffd..d0f13eada0ed 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -30,6 +30,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, */ /* + * Calculate block group number for a given block number + */ +ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block) +{ + ext4_group_t group; + + if (test_opt2(sb, STD_GROUP_SIZE)) + group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + + block) >> + (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); + else + ext4_get_group_no_and_offset(sb, block, &group, NULL); + return group; +} + +/* * Calculate the block group number and offset into the block/cluster * allocation bitmap, given a block number */ @@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, } -static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, - ext4_group_t block_group) +/* + * Check whether the 'block' lives within the 'block_group'. Returns 1 if so + * and 0 otherwise. + */ +static inline int ext4_block_in_group(struct super_block *sb, + ext4_fsblk_t block, + ext4_group_t block_group) { ext4_group_t actual_group; - ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); - if (actual_group == block_group) - return 1; - return 0; + + actual_group = ext4_get_group_number(sb, block); + return (actual_group == block_group) ? 
1 : 0; } /* Return the number of clusters used for file system metadata; this @@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) trace_ext4_read_block_bitmap_load(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(READ, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); return bh; verify: ext4_validate_block_bitmap(sb, desc, block_group, bh); @@ -478,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) static int ext4_has_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { - s64 free_clusters, dirty_clusters, root_clusters; + s64 free_clusters, dirty_clusters, rsv, resv_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); + resv_clusters = atomic64_read(&sbi->s_resv_clusters); /* * r_blocks_count should always be multiple of the cluster ratio so * we are safe to do a plane bit shift only. */ - root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; + rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + + resv_clusters; - if (free_clusters - (nclusters + root_clusters + dirty_clusters) < + if (free_clusters - (nclusters + rsv + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); @@ -499,15 +522,21 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, /* Check whether we have space after accounting for current * dirty clusters & root reserved clusters. */ - if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters)) + if (free_clusters >= (rsv + nclusters + dirty_clusters)) return 1; /* Hm, nope. Are (enough) root reserved clusters available? 
*/ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + if (free_clusters >= (nclusters + dirty_clusters + + resv_clusters)) + return 1; + } + /* No free blocks. Let's see if we can dip into reserved pool */ + if (flags & EXT4_MB_USE_RESERVED) { if (free_clusters >= (nclusters + dirty_clusters)) return 1; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d8cd1f0f4661..f8d56e4254e0 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -46,7 +46,8 @@ static int is_dx_dir(struct inode *inode) if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_COMPAT_DIR_INDEX) && ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) + ((inode->i_size >> sb->s_blocksize_bits) == 1) || + ext4_has_inline_data(inode))) return 1; return 0; @@ -115,14 +116,6 @@ static int ext4_readdir(struct file *filp, int ret = 0; int dir_has_error = 0; - if (ext4_has_inline_data(inode)) { - int has_inline_data = 1; - ret = ext4_read_inline_dir(filp, dirent, filldir, - &has_inline_data); - if (has_inline_data) - return ret; - } - if (is_dx_dir(inode)) { err = ext4_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { @@ -136,6 +129,15 @@ static int ext4_readdir(struct file *filp, ext4_clear_inode_flag(file_inode(filp), EXT4_INODE_INDEX); } + + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + ret = ext4_read_inline_dir(filp, dirent, filldir, + &has_inline_data); + if (has_inline_data) + return ret; + } + stored = 0; offset = filp->f_pos & (sb->s_blocksize - 1); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4a01ba315262..0aabb344b02e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t; #define EXT4_MB_STREAM_ALLOC 0x0800 /* Use reserved root blocks if needed */ #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use 
blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -196,19 +198,8 @@ struct mpage_da_data { #define EXT4_IO_END_ERROR 0x0002 #define EXT4_IO_END_DIRECT 0x0004 -struct ext4_io_page { - struct page *p_page; - atomic_t p_count; -}; - -#define MAX_IO_PAGES 128 - /* * For converting uninitialized extents on a work queue. - * - * 'page' is only used from the writepage() path; 'pages' is only used for - * buffered writes; they are used to keep page references until conversion - * takes place. For AIO/DIO, neither field is filled in. */ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ @@ -218,15 +209,13 @@ typedef struct ext4_io_end { ssize_t size; /* size of the extent */ struct kiocb *iocb; /* iocb struct for AIO */ int result; /* error value for AIO */ - int num_io_pages; /* for writepages() */ - struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */ + atomic_t count; /* reference counter */ } ext4_io_end_t; struct ext4_io_submit { int io_op; struct bio *io_bio; ext4_io_end_t *io_end; - struct ext4_io_page *io_page; sector_t io_next_block; }; @@ -335,9 +324,9 @@ struct ext4_group_desc */ struct flex_groups { - atomic_t free_inodes; - atomic_t free_clusters; - atomic_t used_dirs; + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ @@ -403,7 +392,7 @@ struct flex_groups { #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ /* Flags that should be inherited by new inodes from their parent. 
*/ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ @@ -557,9 +546,8 @@ enum { #define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ EXT4_GET_BLOCKS_CREATE) - /* Caller is from the delayed allocation writeout path, - so set the magic i_delalloc_reserve_flag after taking the - inode allocation semaphore for */ + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* caller is from the direct IO path, request to creation of an unitialized extents if not allocated, split the uninitialized @@ -571,8 +559,9 @@ enum { /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) - /* Punch out blocks of an extent */ -#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Request will not result in inode size update (user for fallocate) */ @@ -616,6 +605,7 @@ enum { #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -949,7 +939,7 @@ struct ext4_inode_info { #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ /* - * Mount flags + * Mount flags set via mount options or defaults */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ @@ -981,8 +971,16 @@ struct ext4_inode_info { #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue 
DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. + */ #define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1179,6 +1177,7 @@ struct ext4_sb_info { unsigned int s_mount_flags; unsigned int s_def_mount_opt; ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; kuid_t s_resuid; kgid_t s_resgid; unsigned short s_mount_state; @@ -1333,6 +1332,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) return ino == EXT4_ROOT_INO || ino == EXT4_USR_QUOTA_INO || ino == EXT4_GRP_QUOTA_INO || + ino == EXT4_BOOT_LOADER_INO || ino == EXT4_JOURNAL_INO || ino == EXT4_RESIZE_INO || (ino >= EXT4_FIRST_INO(sb) && @@ -1374,6 +1374,7 @@ enum { EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1784,9 +1785,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) */ #define ERR_BAD_DX_DIR -75000 -void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, - ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); - /* * Timeout and state flag for lazy initialization inode thread. 
*/ @@ -1908,6 +1906,13 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, struct buffer_head *bh); /* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + extern void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, unsigned int block_group, @@ -2108,8 +2113,9 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, unsigned long nr_segs); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); -extern void ext4_ind_truncate(struct inode *inode); -extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -2117,6 +2123,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); /* namei.c */ extern int ext4_dirent_csum_verify(struct inode *inode, @@ -2511,6 +2518,11 @@ extern int ext4_try_create_inline_dir(handle_t *handle, extern int ext4_read_inline_dir(struct file *filp, void *dirent, filldir_t filldir, int *has_inline_data); +extern int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, @@ -2547,6 +2559,24 @@ extern void 
initialize_dirent_tail(struct ext4_dir_entry_tail *t, extern int ext4_handle_dirty_dirent_node(handle_t *handle, struct inode *inode, struct buffer_head *bh); +#define S_SHIFT 12 +static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; @@ -2573,9 +2603,9 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern void ext4_ext_truncate(struct inode *); -extern int ext4_ext_punch_hole(struct file *file, loff_t offset, - loff_t length); +extern void ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, @@ -2609,17 +2639,26 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2); +void ext4_inode_double_unlock(struct inode *inode1, struct inode 
*inode2); extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); /* page-io.c */ extern int __init ext4_init_pageio(void); -extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); -extern void ext4_ioend_wait(struct inode *); -extern void ext4_free_io_end(ext4_io_end_t *io); +extern void ext4_ioend_shutdown(struct inode *); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); extern void ext4_end_io_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 8643ff5bbeb7..51bc821ade90 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path); + #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 7058975e3a55..451eb4045330 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -43,6 +43,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, { journal_t *journal; + might_sleep(); + trace_ext4_journal_start(sb, nblocks, _RET_IP_); if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); @@ -113,6 +115,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, { int err = 0; + might_sleep(); 
+ if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) @@ -209,6 +213,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, { int err = 0; + might_sleep(); + + set_buffer_meta(bh); + set_buffer_prio(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); if (err) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 4c216b1bf20c..c8c6885406db 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -29,11 +29,13 @@ * block to complete the transaction. * * For extents-enabled fs we may have to allocate and modify up to - * 5 levels of tree + root which are stored in the inode. */ + * 5 levels of tree, data block (for each of these we need bitmap + group + * summaries), root which is stored in the inode, sb + */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - ? 27U : 8U) + ? 20U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode @@ -194,16 +196,20 @@ static inline void ext4_journal_callback_add(handle_t *handle, * ext4_journal_callback_del: delete a registered callback * @handle: active journal transaction handle on which callback was registered * @jce: registered journal callback entry to unregister + * Return true if object was sucessfully removed */ -static inline void ext4_journal_callback_del(handle_t *handle, +static inline bool ext4_journal_callback_try_del(handle_t *handle, struct ext4_journal_cb_entry *jce) { + bool deleted; struct ext4_sb_info *sbi = EXT4_SB(handle->h_transaction->t_journal->j_private); spin_lock(&sbi->s_md_lock); + deleted = !list_empty(&jce->jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); + return deleted; } int diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 28dd8eeea6a9..107936db244e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -157,11 
+157,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -#define ext4_ext_dirty(handle, inode, path) \ - __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) -static int __ext4_ext_dirty(const char *where, unsigned int line, - handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path) { int err; if (path->p_bh) { @@ -1584,10 +1581,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, unsigned short ext1_ee_len, ext2_ee_len, max_len; /* - * Make sure that either both extents are uninitialized, or - * both are _not_. + * Make sure that both extents are initialized. We don't merge + * uninitialized extents so that we can be sure that end_io code has + * the extent that was written properly split out and conversion to + * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) return 0; if (ext4_ext_is_uninitialized(ex1)) @@ -1811,39 +1810,101 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, } depth = ext_depth(inode); ex = path[depth].p_ext; + eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); return -EIO; } /* try to insert block into found extent and return */ - if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) - && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n", - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), - ext4_ext_pblock(ex)); - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - return err; + if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) { /* - * 
ext4_can_extents_be_merged should have checked that either - * both extents are uninitialized, or both aren't. Thus we - * need to check only one of them here. + * Try to see whether we should rather test the extent on + * right from ex, or from the left of ex. This is because + * ext4_ext_find_extent() can return either extent on the + * left, or on the right from the searched position. This + * will make merging more effective. */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; - ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + if (ex < EXT_LAST_EXTENT(eh) && + (le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex) < + le32_to_cpu(newext->ee_block))) { + ex += 1; + goto prepend; + } else if ((ex > EXT_FIRST_EXTENT(eh)) && + (le32_to_cpu(newext->ee_block) + + ext4_ext_get_actual_len(newext) < + le32_to_cpu(ex->ee_block))) + ex -= 1; + + /* Try to append newex to the ex */ + if (ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append [%d]%d block to %u:[%d]%d" + "(from %llu)\n", + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + + /* + * ext4_can_extents_be_merged should have checked + * that either both extents are uninitialized, or + * both aren't. Thus we need to check only one of + * them here. 
+ */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); - eh = path[depth].p_hdr; - nearex = ex; - goto merge; + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + +prepend: + /* Try to prepend newex to the ex */ + if (ext4_can_extents_be_merged(inode, newext, ex)) { + ext_debug("prepend %u[%d]%d block to %u:[%d]%d" + "(from %llu)\n", + le32_to_cpu(newext->ee_block), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + + /* + * ext4_can_extents_be_merged should have checked + * that either both extents are uninitialized, or + * both aren't. Thus we need to check only one of + * them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_block = newext->ee_block; + ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } } depth = ext_depth(inode); @@ -1878,8 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. 
*/ - if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) - flags = EXT4_MB_USE_ROOT_BLOCKS; + if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL) + flags = EXT4_MB_USE_RESERVED; err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); if (err) goto cleanup; @@ -2597,8 +2658,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) +int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2665,12 +2726,14 @@ again: /* * Split the extent in two so that 'end' is the last - * block in the first new extent + * block in the first new extent. Also we should not + * fail removing space due to ENOSPC so try to use + * reserved block if that happens. */ err = ext4_split_extent_at(handle, inode, path, - end + 1, split_flag, - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_PUNCH_OUT_EXT); + end + 1, split_flag, + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_METADATA_NOFAIL); if (err < 0) goto out; @@ -2923,7 +2986,7 @@ static int ext4_split_extent_at(handle_t *handle, { ext4_fsblk_t newblock; ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0; @@ -2943,6 +3006,10 @@ static int ext4_split_extent_at(handle_t *handle, newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + BUG_ON(!ext4_ext_is_uninitialized(ex) && + split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -2990,12 +3057,29 @@ static int ext4_split_extent_at(handle_t *handle, err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { if (split_flag & 
(EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { - if (split_flag & EXT4_EXT_DATA_VALID1) + if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); - else + zero_ex.ee_block = ex2->ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex2)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex2)); + } else { err = ext4_ext_zeroout(inode, ex); - } else + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + } + } else { err = ext4_ext_zeroout(inode, &orig_ex); + zero_ex.ee_block = orig_ex.ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(&orig_ex)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(&orig_ex)); + } if (err) goto fix_extent_len; @@ -3003,6 +3087,12 @@ static int ext4_split_extent_at(handle_t *handle, ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto fix_extent_len; + + /* update extent status tree */ + err = ext4_es_zeroout(inode, &zero_ex); + goto out; } else if (err) goto fix_extent_len; @@ -3041,6 +3131,7 @@ static int ext4_split_extent(handle_t *handle, int err = 0; int uninitialized; int split_flag1, flags1; + int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3060,20 +3151,29 @@ static int ext4_split_extent(handle_t *handle, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; + } else { + allocated = ee_len - (map->m_lblk - ee_block); } - + /* + * Update path is required because previous ext4_split_extent_at() may + * result in split of original leaf or extent zeroout. 
+ */ ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, map->m_lblk, path); if (IS_ERR(path)) return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + uninitialized = ext4_ext_is_uninitialized(ex); + split_flag1 = 0; if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_DATA_VALID2); - if (uninitialized) + split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; + if (uninitialized) { split_flag1 |= EXT4_EXT_MARK_UNINIT1; - if (split_flag & EXT4_EXT_MARK_UNINIT2) - split_flag1 |= EXT4_EXT_MARK_UNINIT2; + split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT2); + } err = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); if (err) @@ -3082,7 +3182,7 @@ static int ext4_split_extent(handle_t *handle, ext4_ext_show_leaf(inode, path); out: - return err ? err : map->m_len; + return err ? err : allocated; } /* @@ -3108,35 +3208,36 @@ out: static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path) + struct ext4_ext_path *path, + int flags) { struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex; - struct ext4_extent *ex; + struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; - unsigned int ee_len, depth; - int allocated, max_zeroout = 0; + unsigned int ee_len, depth, map_len = map->m_len; + int allocated = 0, max_zeroout = 0; int err = 0; int split_flag = 0; ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); + (unsigned long long)map->m_lblk, map_len); sbi = EXT4_SB(inode->i_sb); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; - if (eof_block < map->m_lblk + map->m_len) - eof_block = map->m_lblk + map->m_len; + if (eof_block < map->m_lblk + map_len) + eof_block 
= map->m_lblk + map_len; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - allocated = ee_len - (map->m_lblk - ee_block); + zero_ex.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3146,77 +3247,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, /* * Attempt to transfer newly initialized blocks from the currently - * uninitialized extent to its left neighbor. This is much cheaper + * uninitialized extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly - * memmove() calls. This is the common case in steady state for - * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append - * writes. + * memmove() calls. Transferring to the left is the common case in + * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) + * followed by append writes. * * Limitations of the current logic: - * - L1: we only deal with writes at the start of the extent. - * The approach could be extended to writes at the end - * of the extent but this scenario was deemed less common. - * - L2: we do not deal with writes covering the whole extent. + * - L1: we do not deal with writes covering the whole extent. * This would require removing the extent if the transfer * is possible. - * - L3: we only attempt to merge with an extent stored in the + * - L2: we only attempt to merge with an extent stored in the * same extent tree node. 
*/ - if ((map->m_lblk == ee_block) && /*L1*/ - (map->m_len < ee_len) && /*L2*/ - (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/ - struct ext4_extent *prev_ex; + if ((map->m_lblk == ee_block) && + /* See if we can merge left */ + (map_len < ee_len) && /*L1*/ + (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ ext4_lblk_t prev_lblk; ext4_fsblk_t prev_pblk, ee_pblk; - unsigned int prev_len, write_len; + unsigned int prev_len; - prev_ex = ex - 1; - prev_lblk = le32_to_cpu(prev_ex->ee_block); - prev_len = ext4_ext_get_actual_len(prev_ex); - prev_pblk = ext4_ext_pblock(prev_ex); + abut_ex = ex - 1; + prev_lblk = le32_to_cpu(abut_ex->ee_block); + prev_len = ext4_ext_get_actual_len(abut_ex); + prev_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); - write_len = map->m_len; /* - * A transfer of blocks from 'ex' to 'prev_ex' is allowed + * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: - * - C1: prev_ex is initialized, - * - C2: prev_ex is logically abutting ex, - * - C3: prev_ex is physically abutting ex, - * - C4: prev_ex can receive the additional blocks without + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. 
*/ - if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/ + if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ - (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/ + (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; trace_ext4_ext_convert_to_initialized_fastpath(inode, - map, ex, prev_ex); + map, ex, abut_ex); - /* Shift the start of ex by 'write_len' blocks */ - ex->ee_block = cpu_to_le32(ee_block + write_len); - ext4_ext_store_pblock(ex, ee_pblk + write_len); - ex->ee_len = cpu_to_le16(ee_len - write_len); + /* Shift the start of ex by 'map_len' blocks */ + ex->ee_block = cpu_to_le32(ee_block + map_len); + ext4_ext_store_pblock(ex, ee_pblk + map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_uninitialized(ex); /* Restore the flag */ - /* Extend prev_ex by 'write_len' blocks */ - prev_ex->ee_len = cpu_to_le16(prev_len + write_len); + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(prev_len + map_len); + + /* Result: number of initialized blocks past m_lblk */ + allocated = map_len; + } + } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && + (map_len < ee_len) && /*L1*/ + ex < EXT_LAST_EXTENT(eh)) { /*L2*/ + /* See if we can merge right */ + ext4_lblk_t next_lblk; + ext4_fsblk_t next_pblk, ee_pblk; + unsigned int next_len; + + abut_ex = ex + 1; + next_lblk = le32_to_cpu(abut_ex->ee_block); + next_len = ext4_ext_get_actual_len(abut_ex); + next_pblk = ext4_ext_pblock(abut_ex); + ee_pblk = ext4_ext_pblock(ex); + + /* + * A transfer of blocks from 'ex' to 'abut_ex' is allowed + * upon those conditions: + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without + * overflowing the (initialized) length limit. 
+ */ + if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + ((map->m_lblk + map_len) == next_lblk) && /*C2*/ + ((ee_pblk + ee_len) == next_pblk) && /*C3*/ + (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, abut_ex); - /* Mark the block containing both extents as dirty */ - ext4_ext_dirty(handle, inode, path + depth); + /* Shift the start of abut_ex by 'map_len' blocks */ + abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); + ext4_ext_store_pblock(abut_ex, next_pblk - map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); + ext4_ext_mark_uninitialized(ex); /* Restore the flag */ - /* Update path to point to the right extent */ - path[depth].p_ext = prev_ex; + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(next_len + map_len); /* Result: number of initialized blocks past m_lblk */ - allocated = write_len; - goto out; + allocated = map_len; } } + if (allocated) { + /* Mark the block containing both extents as dirty */ + ext4_ext_dirty(handle, inode, path + depth); + + /* Update path to point to the right extent */ + path[depth].p_ext = abut_ex; + goto out; + } else + allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* @@ -3227,13 +3372,16 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (EXT4_EXT_MAY_ZEROOUT & split_flag) max_zeroout = sbi->s_extent_max_zeroout_kb >> - inode->i_sb->s_blocksize_bits; + (inode->i_sb->s_blocksize_bits - 10); /* If extent is less than s_max_zeroout_kb, zeroout directly */ if (max_zeroout && (ee_len <= max_zeroout)) { err = ext4_ext_zeroout(inode, ex); if (err) goto out; + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); + ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3287,11 +3435,14 @@ 
static int ext4_ext_convert_to_initialized(handle_t *handle, } allocated = ext4_split_extent(handle, inode, path, - &split_map, split_flag, 0); + &split_map, split_flag, flags); if (allocated < 0) err = allocated; out: + /* If we have gotten a failure, don't zero out status tree */ + if (!err) + err = ext4_es_zeroout(inode, &zero_ex); return err ? err : allocated; } @@ -3374,8 +3525,19 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, "block %llu, max_blocks %u\n", inode->i_ino, (unsigned long long)ee_block, ee_len); - /* If extent is larger than requested then split is required */ + /* If extent is larger than requested it is a clear sign that we still + * have some extent state machine issues left. So extent_split is still + * required. + * TODO: Once all related issues will be fixed this situation should be + * illegal. + */ if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef EXT4_DEBUG + ext4_warning("Inode (%ld) finished: extent logical block %llu," + " len %u; IO logical block %llu, len %u\n", + inode->i_ino, (unsigned long long)ee_block, ee_len, + (unsigned long long)map->m_lblk, map->m_len); +#endif err = ext4_split_unwritten_extents(handle, inode, map, path, EXT4_GET_BLOCKS_CONVERT); if (err < 0) @@ -3593,6 +3755,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); + /* + * When writing into uninitialized space, we should not fail to + * allocate metadata blocks for the new extent block if needed. 
+ */ + flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; + trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, allocated, newblock); @@ -3626,6 +3794,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, path, map->m_len); } else err = ret; + map->m_flags |= EXT4_MAP_MAPPED; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; goto out2; } /* buffered IO case */ @@ -3652,7 +3824,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, } /* buffered write, writepage time, convert*/ - ret = ext4_ext_convert_to_initialized(handle, inode, map, path); + ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out: @@ -3675,6 +3847,7 @@ out: allocated - map->m_len); allocated = map->m_len; } + map->m_len = allocated; /* * If we have done fallocate with the offset that is already @@ -4106,9 +4279,6 @@ got_allocated_blocks: } } else { BUG_ON(allocated_clusters < reserved_clusters); - /* We will claim quota for all newly allocated blocks.*/ - ext4_da_update_reserve_space(inode, allocated_clusters, - 1); if (reserved_clusters < allocated_clusters) { struct ext4_inode_info *ei = EXT4_I(inode); int reservation = allocated_clusters - @@ -4159,6 +4329,15 @@ got_allocated_blocks: ei->i_reserved_data_blocks += reservation; spin_unlock(&ei->i_block_reservation_lock); } + /* + * We will claim quota for all newly allocated blocks. + * We're updating the reserved space *after* the + * correction above so we do not accidentally free + * all the metadata reservation because we might + * actually need it later on. + */ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); } } @@ -4189,48 +4368,13 @@ out3: return err ? 
err : allocated; } -void ext4_ext_truncate(struct inode *inode) +void ext4_ext_truncate(handle_t *handle, struct inode *inode) { - struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; - handle_t *handle; - loff_t page_len; int err = 0; /* - * finish any pending end_io work so we won't run the risk of - * converting any truncated blocks to initialized later - */ - ext4_flush_unwritten_io(inode); - - /* - * probably first extent we're gonna free will be last in block - */ - err = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, err); - if (IS_ERR(handle)) - return; - - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out_stop; - } - - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - down_write(&EXT4_I(inode)->i_data_sem); - - ext4_discard_preallocations(inode); - - /* * TODO: optimization is possible here. * Probably we need not scan at all, * because page truncation is enough. @@ -4245,29 +4389,6 @@ void ext4_ext_truncate(struct inode *inode) err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); - - /* In a multi-transaction truncate, we only make the final - * transaction synchronous. - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out_stop: - /* - * If this was a simple ftruncate() and the file will remain alive, - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. 
- */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); } static void ext4_falloc_update_inode(struct inode *inode, @@ -4368,8 +4489,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (len <= EXT_UNINIT_MAX_LEN << blkbits) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Prevent race condition between unwritten */ - ext4_flush_unwritten_io(inode); retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; @@ -4557,187 +4676,6 @@ static int ext4_xattr_fiemap(struct inode *inode, return (error < 0 ? error : 0); } -/* - * ext4_ext_punch_hole - * - * Punches a hole of "length" bytes in a file starting - * at byte "offset" - * - * @inode: The inode of the file to punch a hole in - * @offset: The starting byte offset of the hole - * @length: The length of the hole - * - * Returns the number of blocks removed or negative on err - */ -int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - handle_t *handle; - loff_t first_page, last_page, page_len; - loff_t first_page_offset, last_page_offset; - int credits, err = 0; - - /* - * Write out all dirty pages to avoid race conditions - * Then release them. 
- */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - offset, offset + length - 1); - - if (err) - return err; - } - - mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - err = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - err = -ETXTBSY; - goto out_mutex; - } - - /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - goto out_mutex; - - /* - * If the hole extends beyond i_size, set the hole - * to end after the page that contains i_size - */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - - offset; - } - - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; - - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; - - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_pagecache_range(inode, first_page_offset, - last_page_offset - 1); - } - - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - err = ext4_flush_unwritten_io(inode); - if (err) - goto out_dio; - inode_dio_wait(inode); - - credits = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto out_dio; - } - - - /* - * Now we need to zero out the non-page-aligned data in the - * pages at the start and tail of the hole, and unmap the buffer - * heads for the block aligned regions of the page that were - * completely zeroed. 
- */ - if (first_page > last_page) { - /* - * If the file space being truncated is contained within a page - * just zero out and unmap the middle of that page - */ - err = ext4_discard_partial_page_buffers(handle, - mapping, offset, length, 0); - - if (err) - goto out; - } else { - /* - * zero out and unmap the partial page that contains - * the start of the hole - */ - page_len = first_page_offset - offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - offset, page_len, 0); - if (err) - goto out; - } - - /* - * zero out and unmap the partial page that contains - * the end of the hole - */ - page_len = offset + length - last_page_offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - last_page_offset, page_len, 0); - if (err) - goto out; - } - } - - /* - * If i_size is contained in the last page, we need to - * unmap and zero the partial page after i_size - */ - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && - inode->i_size % PAGE_CACHE_SIZE != 0) { - - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out; - } - } - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - - /* If there are no blocks to remove, return now */ - if (first_block >= stop_block) - goto out; - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - err = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - err = ext4_ext_remove_space(inode, first_block, stop_block - 1); - - ext4_discard_preallocations(inode); - - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out: - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - 
ext4_journal_stop(handle); -out_dio: - ext4_inode_resume_unlocked_dio(inode); -out_mutex: - mutex_unlock(&inode->i_mutex); - return err; -} - int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 95796a1b7522..fe3337a85ede 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -333,17 +333,27 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) static int ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2) { - if (es1->es_lblk + es1->es_len != es2->es_lblk) + if (ext4_es_status(es1) != ext4_es_status(es2)) return 0; - if (ext4_es_status(es1) != ext4_es_status(es2)) + if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL) return 0; - if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && - (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2))) + if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) return 0; - return 1; + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && + (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) + return 1; + + if (ext4_es_is_hole(es1)) + return 1; + + /* we need to check delayed extent is without unwritten status */ + if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) + return 1; + + return 0; } static struct extent_status * @@ -389,6 +399,179 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) return es; } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_es_insert_extent_ext_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + ext4_fsblk_t ee_start; + unsigned short ee_len; + int depth, ee_status, es_status; + + path = ext4_ext_find_extent(inode, es->es_lblk, NULL); + if (IS_ERR(path)) + return; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (ex) { + + ee_block = 
le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + ee_len = ext4_ext_get_actual_len(ex); + + ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0; + es_status = ext4_es_is_unwritten(es) ? 1 : 0; + + /* + * Make sure ex and es are not overlap when we try to insert + * a delayed/hole extent. + */ + if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { + if (in_range(es->es_lblk, ee_block, ee_len)) { + pr_warn("ES insert assertation failed for " + "inode: %lu we can find an extent " + "at block [%d/%d/%llu/%c], but we " + "want to add an delayed/hole extent " + "[%d/%d/%llu/%llx]\n", + inode->i_ino, ee_block, ee_len, + ee_start, ee_status ? 'u' : 'w', + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + goto out; + } + + /* + * We don't check ee_block == es->es_lblk, etc. because es + * might be a part of whole extent, vice versa. + */ + if (es->es_lblk < ee_block || + ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + goto out; + } + + if (ee_status ^ es_status) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + } + } else { + /* + * We can't find an extent on disk. So we need to make sure + * that we don't want to add an written/unwritten extent. 
+ */ + if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "can't find an extent at block %d but we want " + "to add an written/unwritten extent " + "[%d/%d/%llu/%llx]\n", inode->i_ino, + es->es_lblk, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + } +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } +} + +static void ext4_es_insert_extent_ind_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_map_blocks map; + int retval; + + /* + * Here we call ext4_ind_map_blocks to lookup a block mapping because + * 'Indirect' structure is defined in indirect.c. So we couldn't + * access direct/indirect tree from outside. It is too dirty to define + * this function in indirect.c file. + */ + + map.m_lblk = es->es_lblk; + map.m_len = es->es_len; + + retval = ext4_ind_map_blocks(NULL, inode, &map, 0); + if (retval > 0) { + if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { + /* + * We want to add a delayed/hole extent but this + * block has been allocated. + */ + pr_warn("ES insert assertation failed for inode: %lu " + "We can find blocks but we want to add a " + "delayed/hole extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } else if (ext4_es_is_written(es)) { + if (retval != es->es_len) { + pr_warn("ES insert assertation failed for " + "inode: %lu retval %d != es_len %d\n", + inode->i_ino, retval, es->es_len); + return; + } + if (map.m_pblk != ext4_es_pblock(es)) { + pr_warn("ES insert assertation failed for " + "inode: %lu m_pblk %llu != " + "es_pblk %llu\n", + inode->i_ino, map.m_pblk, + ext4_es_pblock(es)); + return; + } + } else { + /* + * We don't need to check unwritten extent because + * indirect-based file doesn't have it. 
+ */ + BUG_ON(1); + } + } else if (retval == 0) { + if (ext4_es_is_written(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "We can't find the block but we want to add " + "an written extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } + } +} + +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ + /* + * We don't need to worry about the race condition because + * caller takes i_data_sem locking. + */ + BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_es_insert_extent_ext_check(inode, es); + else + ext4_es_insert_extent_ind_check(inode, es); +} +#else +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ +} +#endif + static int __es_insert_extent(struct inode *inode, struct extent_status *newes) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; @@ -471,6 +654,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_store_status(&newes, status); trace_ext4_es_insert_extent(inode, &newes); + ext4_es_insert_extent_check(inode, &newes); + write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end); if (err != 0) @@ -669,6 +854,23 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, return err; } +int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct ext4_sb_info *sbi = container_of(shrink, diff --git a/fs/ext4/extents_status.h 
b/fs/ext4/extents_status.h index f190dfe969da..d8e2d4dc311e 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -21,6 +21,12 @@ #endif /* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. + */ +#define ES_AGGRESSIVE_TEST__ + +/* * These flags live in the high bits of extent_status.es_pblk */ #define EXTENT_STATUS_WRITTEN (1ULL << 63) @@ -33,6 +39,8 @@ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) +struct ext4_extent; + struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ @@ -58,6 +66,7 @@ extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex); static inline int ext4_es_is_written(struct extent_status *es) { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 64848b595b24..4959e29573b6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,6 +23,7 @@ #include <linux/jbd2.h> #include <linux/mount.h> #include <linux/path.h> +#include <linux/aio.h> #include <linux/quotaops.h> #include <linux/pagevec.h> #include "ext4.h" diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 3278e64e57b6..e0ba8a408def 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -166,8 +166,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(journal, commit_tid)) needs_barrier = true; - jbd2_log_start_commit(journal, commit_tid); - ret = jbd2_log_wait_commit(journal, commit_tid); + ret = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); if (!ret) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 32fd2b9075dd..00a818d67b54 100644 --- a/fs/ext4/ialloc.c 
+++ b/fs/ext4/ialloc.c @@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) trace_ext4_load_inode_bitmap(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(READ, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { put_bh(bh); @@ -324,8 +324,8 @@ error_return: } struct orlov_stats { + __u64 free_clusters; __u32 free_inodes; - __u32 free_clusters; __u32 used_dirs; }; @@ -342,7 +342,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_clusters = atomic_read(&flex_group[g].free_clusters); + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } @@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ei = EXT4_I(inode); sbi = EXT4_SB(sb); + /* + * Initalize owners and quota early so that we don't have to account + * for quota initialization worst case in standard inode creating + * transaction + */ + if (owner) { + inode->i_mode = mode; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); + } else if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + if (!goal) goal = sbi->s_inode_goal; @@ -697,7 +714,7 @@ got_group: gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) - goto fail; + goto out; /* * Check free inodes count before loading bitmap. 
@@ -711,7 +728,7 @@ got_group: brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); if (!inode_bitmap_bh) - goto fail; + goto out; repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) @@ -733,13 +750,16 @@ repeat_in_this_group: handle_type, nblocks); if (IS_ERR(handle)) { err = PTR_ERR(handle); - goto fail; + ext4_std_error(sb, err); + goto out; } } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); ext4_unlock_group(sb, group); @@ -755,8 +775,10 @@ repeat_in_this_group: got: BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && @@ -768,7 +790,8 @@ got: err = ext4_journal_get_write_access(handle, block_bitmap_bh); if (err) { brelse(block_bitmap_bh); - goto fail; + ext4_std_error(sb, err); + goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); @@ -787,14 +810,18 @@ got: ext4_unlock_group(sb, group); brelse(block_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } } BUFFER_TRACE(group_desc_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { @@ -840,8 +867,10 @@ got: BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } 
percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) @@ -851,16 +880,6 @@ got: flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - if (owner) { - inode->i_mode = mode; - i_uid_write(inode, owner[0]); - i_gid_write(inode, owner[1]); - } else if (test_opt(sb, GRPID)) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = dir->i_gid; - } else - inode_init_owner(inode, dir, mode); inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ @@ -889,7 +908,9 @@ got: * twice. */ err = -EIO; - goto fail; + ext4_error(sb, "failed to insert inode %lu: doubly allocated?", + inode->i_ino); + goto out; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -899,7 +920,6 @@ got: if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { __u32 csum; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, @@ -918,7 +938,6 @@ got: ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; - dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto fail_drop; @@ -952,24 +971,17 @@ got: ext4_debug("allocating inode %lu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); - goto really_out; -fail: - ext4_std_error(sb, err); -out: - iput(inode); - ret = ERR_PTR(err); -really_out: brelse(inode_bitmap_bh); return ret; fail_free_drop: dquot_free_inode(inode); - fail_drop: - dquot_drop(inode); - inode->i_flags |= S_NOQUOTA; clear_nlink(inode); unlock_new_inode(inode); +out: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; iput(inode); brelse(inode_bitmap_bh); return ERR_PTR(err); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index b505a145a593..b8d5d351e24f 100644 --- a/fs/ext4/indirect.c +++ 
b/fs/ext4/indirect.c @@ -20,6 +20,7 @@ * (sct@redhat.com), 1993, 1998 */ +#include <linux/aio.h> #include "ext4_jbd2.h" #include "truncate.h" #include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ @@ -292,131 +293,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, } /** - * ext4_alloc_blocks: multiple allocate blocks needed for a branch - * @handle: handle for this transaction - * @inode: inode which needs allocated blocks - * @iblock: the logical block to start allocated at - * @goal: preferred physical block of allocation - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: number of desired blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @err: on return it will store the error code - * - * This function will return the number of blocks allocated as - * requested by the passed-in parameters. - */ -static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) -{ - struct ext4_allocation_request ar; - int target, i; - unsigned long count = 0, blk_allocated = 0; - int index = 0; - ext4_fsblk_t current_block = 0; - int ret = 0; - - /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. 
That's the - * minimum number of blocks need to allocate(required) - */ - /* first we try to allocate the indirect blocks */ - target = indirect_blks; - while (target > 0) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, goal, - 0, &count, err); - if (*err) - goto failed_out; - - if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + count %lu > %d!", - current_block, count, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - if (count > 0) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - WARN(1, KERN_INFO "%s returned more blocks than " - "requested\n", __func__); - break; - } - } - - target = blks - count ; - blk_allocated = count; - if (!target) - goto allocated; - /* Now allocate data blocks */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = target; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - /* enable in-core preallocation only for regular files */ - ar.flags = EXT4_MB_HINT_DATA; - - current_block = ext4_mb_new_blocks(handle, &ar, err); - if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + ar.len %d > %d!", - current_block, ar.len, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - if (*err && (target == blks)) { - /* - * if the allocation failed and we didn't allocate - * any blocks before - */ - goto failed_out; - } - if (!*err) { - if (target == blks) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - } - blk_allocated += ar.len; - } -allocated: - /* total number of blocks allocated for 
direct blocks */ - ret = blk_allocated; - *err = 0; - return ret; -failed_out: - for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - return ret; -} - -/** * ext4_alloc_branch - allocate and set up a chain of blocks. * @handle: handle for this transaction * @inode: owner @@ -448,60 +324,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, int *blks, ext4_fsblk_t goal, ext4_lblk_t *offsets, Indirect *branch) { - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext4_fsblk_t new_blocks[4]; - ext4_fsblk_t current_block; - - num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; + struct ext4_allocation_request ar; + struct buffer_head * bh; + ext4_fsblk_t b, new_blocks[4]; + __le32 *p; + int i, j, err, len = 1; - branch[0].key = cpu_to_le32(new_blocks[0]); /* - * metadata blocks and data blocks are allocated. + * Set up for the direct block allocation */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. 
- */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.len = *blks; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + + for (i = 0; i <= indirect_blks; i++) { + if (i == indirect_blks) { + ar.goal = goal; + new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); + } else + goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, + goal, 0, NULL, &err); + if (err) { + i--; + goto failed; + } + branch[i].key = cpu_to_le32(new_blocks[i]); + if (i == 0) + continue; + + bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); if (unlikely(!bh)) { err = -ENOMEM; goto failed; } - - branch[n].bh = bh; lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, bh); if (err) { - /* Don't brelse(bh) here; it's done in - * ext4_journal_forget() below */ unlock_buffer(bh); goto failed; } - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if (n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i = 1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } + memset(bh->b_data, 0, bh->b_size); + p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; + b = new_blocks[i]; + + if (i == indirect_blks) + len = ar.len; + for (j = 0; j < len; j++) + *p++ = cpu_to_le32(b++); + BUFFER_TRACE(bh, "marking uptodate"); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -511,25 +386,16 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, if (err) goto failed; } - *blks = num; - return err; + *blks = ar.len; + return 0; failed: - /* Allocation failed, free what we already allocated */ - ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); - for (i = 1; i <= n ; i++) { - 
/* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, - EXT4_FREE_BLOCKS_FORGET); + for (; i >= 0; i--) { + if (i != indirect_blks && branch[i].bh) + ext4_forget(handle, 1, inode, branch[i].bh, + branch[i].bh->b_blocknr); + ext4_free_blocks(handle, inode, NULL, new_blocks[i], + (i == indirect_blks) ? ar.len : 1, 0); } - for (i = n+1; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - - ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); - return err; } @@ -941,26 +807,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) * be able to restart the transaction at a conventient checkpoint to make * sure we don't overflow the journal. * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If + * Try to extend this transaction for the purposes of truncation. If * extend fails, we need to propagate the failure up and restart the * transaction in the top-level truncate loop. --sct - */ -static handle_t *start_transaction(struct inode *inode) -{ - handle_t *result; - - result = ext4_journal_start(inode, EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext4_std_error(inode->i_sb, PTR_ERR(result)); - return result; -} - -/* - * Try to extend this transaction for the purposes of truncation. * * Returns 0 if we managed to create more room. If we can't create more * room, and the transaction must be restarted we return 1. 
@@ -1353,68 +1202,30 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, } } -void ext4_ind_truncate(struct inode *inode) +void ext4_ind_truncate(handle_t *handle, struct inode *inode) { - handle_t *handle; struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; - loff_t page_len; unsigned blocksize = inode->i_sb->s_blocksize; - int err; - - handle = start_transaction(inode); - if (IS_ERR(handle)) - return; /* AKPM: return what? */ last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out_stop; - } - if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) - goto out_stop; /* error */ + return; } - /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. - * - * Implication: the file must always be in a sane, consistent - * truncatable state while each transaction commits. - */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. 
- */ - down_write(&ei->i_data_sem); - - ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* @@ -1431,7 +1242,7 @@ void ext4_ind_truncate(struct inode *inode) * It is unnecessary to free any data blocks if last_block is * equal to the indirect block limit. */ - goto out_unlock; + return; } else if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); @@ -1491,31 +1302,6 @@ do_indirects: case EXT4_TIND_BLOCK: ; } - -out_unlock: - up_write(&ei->i_data_sem); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); -out_stop: - /* - * If this was a simple ftruncate(), and the file will remain alive - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. 
- */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - ext4_journal_stop(handle); - trace_ext4_truncate_exit(inode); } static int free_hole_blocks(handle_t *handle, struct inode *inode, @@ -1539,9 +1325,9 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode, blk = *i_data; if (level > 0) { ext4_lblk_t first2; - bh = sb_bread(inode->i_sb, blk); + bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, blk, + EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), "Read failure"); return -EIO; } @@ -1569,8 +1355,8 @@ err: return ret; } -static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t first, ext4_lblk_t stop) +int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop) { int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); int level, ret = 0; @@ -1604,157 +1390,3 @@ err: return ret; } -int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - handle_t *handle = NULL; - loff_t first_page, last_page, page_len; - loff_t first_page_offset, last_page_offset; - int err = 0; - - /* - * Write out all dirty pages to avoid race conditions - * Then release them. 
- */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - offset, offset + length - 1); - if (err) - return err; - } - - mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - err = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - err = -ETXTBSY; - goto out_mutex; - } - - /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - goto out_mutex; - - /* - * If the hole extents beyond i_size, set the hole - * to end after the page that contains i_size - */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - - offset; - } - - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; - - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; - - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_pagecache_range(inode, first_page_offset, - last_page_offset - 1); - } - - /* Wait all existing dio works, newcomers will block on i_mutex */ - inode_dio_wait(inode); - - handle = start_transaction(inode); - if (IS_ERR(handle)) - goto out_mutex; - - /* - * Now we need to zero out the non-page-aligned data in the - * pages at the start and tail of the hole, and unmap the buffer - * heads for the block aligned regions of the page that were - * completely zerod. 
- */ - if (first_page > last_page) { - /* - * If the file space being truncated is contained within a page - * just zero out and unmap the middle of that page - */ - err = ext4_discard_partial_page_buffers(handle, - mapping, offset, length, 0); - if (err) - goto out; - } else { - /* - * Zero out and unmap the paritial page that contains - * the start of the hole - */ - page_len = first_page_offset - offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - offset, page_len, 0); - if (err) - goto out; - } - - /* - * Zero out and unmap the partial page that contains - * the end of the hole - */ - page_len = offset + length - last_page_offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - last_page_offset, page_len, 0); - if (err) - goto out; - } - } - - /* - * If i_size contained in the last page, we need to - * unmap and zero the paritial page after i_size - */ - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && - inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - if (err) - goto out; - } - } - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - - if (first_block >= stop_block) - goto out; - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - err = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - err = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - - ext4_discard_preallocations(inode); - - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out: - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - -out_mutex: - 
mutex_unlock(&inode->i_mutex); - - return err; -} diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c0fd1a123f7d..3e2bf873e8a8 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -19,7 +19,8 @@ #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) -#define EXT4_INLINE_DOTDOT_SIZE 4 +#define EXT4_INLINE_DOTDOT_OFFSET 2 +#define EXT4_INLINE_DOTDOT_SIZE 4 int ext4_get_inline_size(struct inode *inode) { @@ -1289,6 +1290,120 @@ out: return ret; } +/* + * This function fills a red-black tree with information from an + * inlined dir. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. + */ +int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data) +{ + int err = 0, count = 0; + unsigned int parent_ino; + int pos; + struct ext4_dir_entry_2 *de; + struct inode *inode = file_inode(dir_file); + int ret, inline_size = 0; + struct ext4_iloc iloc; + void *dir_buf = NULL; + struct ext4_dir_entry_2 fake; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + *has_inline_data = 0; + goto out; + } + + inline_size = ext4_get_inline_size(inode); + dir_buf = kmalloc(inline_size, GFP_NOFS); + if (!dir_buf) { + ret = -ENOMEM; + up_read(&EXT4_I(inode)->xattr_sem); + goto out; + } + + ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + if (ret < 0) + goto out; + + pos = 0; + parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + while (pos < inline_size) { + /* + * As inlined dir doesn't store any information about '.' and + * only the inode number of '..' is stored, we have to handle + * them differently. 
+ */ + if (pos == 0) { + fake.inode = cpu_to_le32(inode->i_ino); + fake.name_len = 1; + strcpy(fake.name, "."); + fake.rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(fake.name_len), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; + pos = EXT4_INLINE_DOTDOT_OFFSET; + } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { + fake.inode = cpu_to_le32(parent_ino); + fake.name_len = 2; + strcpy(fake.name, ".."); + fake.rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(fake.name_len), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; + pos = EXT4_INLINE_DOTDOT_SIZE; + } else { + de = (struct ext4_dir_entry_2 *)(dir_buf + pos); + pos += ext4_rec_len_from_disk(de->rec_len, inline_size); + if (ext4_check_dir_entry(inode, dir_file, de, + iloc.bh, dir_buf, + inline_size, pos)) { + ret = count; + goto out; + } + } + + ext4fs_dirhash(de->name, de->name_len, hinfo); + if ((hinfo->hash < start_hash) || + ((hinfo->hash == start_hash) && + (hinfo->minor_hash < start_minor_hash))) + continue; + if (de->inode == 0) + continue; + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de); + if (err) { + count = err; + goto out; + } + count++; + } + ret = count; +out: + kfree(dir_buf); + brelse(iloc.bh); + return ret; +} + +/* + * So this function is called when the volume is mkfsed with + * dir_index disabled. In order to keep f_pos persistent + * after we convert from an inlined dir to a blocked based, + * we just pretend that we are a normal dir and return the + * offset as if '.' and '..' really take place. 
+ * + */ int ext4_read_inline_dir(struct file *filp, void *dirent, filldir_t filldir, int *has_inline_data) @@ -1302,6 +1417,7 @@ int ext4_read_inline_dir(struct file *filp, int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; + int dotdot_offset, dotdot_size, extra_offset, extra_size; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1330,8 +1446,21 @@ int ext4_read_inline_dir(struct file *filp, sb = inode->i_sb; stored = 0; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + offset = filp->f_pos; - while (!error && !stored && filp->f_pos < inode->i_size) { + /* + * dotdot_offset and dotdot_size is the real offset and + * size for ".." and "." if the dir is block based while + * the real size for them are only EXT4_INLINE_DOTDOT_SIZE. + * So we will use extra_offset and extra_size to indicate them + * during the inline dir iteration. + */ + dotdot_offset = EXT4_DIR_REC_LEN(1); + dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); + extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; + extra_size = extra_offset + inline_size; + + while (!error && !stored && filp->f_pos < extra_size) { revalidate: /* * If the version has changed since the last call to @@ -1340,15 +1469,23 @@ revalidate: * dir to make sure. */ if (filp->f_version != inode->i_version) { - for (i = 0; - i < inode->i_size && i < offset;) { + for (i = 0; i < extra_size && i < offset;) { + /* + * "." is with offset 0 and + * ".." is dotdot_offset. + */ if (!i) { - /* skip "." and ".." if needed. */ - i += EXT4_INLINE_DOTDOT_SIZE; + i = dotdot_offset; + continue; + } else if (i == dotdot_offset) { + i = dotdot_size; continue; } + /* for other entry, the real offset in + * the buf has to be tuned accordingly. 
+ */ de = (struct ext4_dir_entry_2 *) - (dir_buf + i); + (dir_buf + i - extra_offset); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at @@ -1356,43 +1493,47 @@ revalidate: * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, - inline_size) < EXT4_DIR_REC_LEN(1)) + extra_size) < EXT4_DIR_REC_LEN(1)) break; i += ext4_rec_len_from_disk(de->rec_len, - inline_size); + extra_size); } offset = i; filp->f_pos = offset; filp->f_version = inode->i_version; } - while (!error && filp->f_pos < inode->i_size) { + while (!error && filp->f_pos < extra_size) { if (filp->f_pos == 0) { error = filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR); if (error) break; stored++; + filp->f_pos = dotdot_offset; + continue; + } - error = filldir(dirent, "..", 2, 0, parent_ino, - DT_DIR); + if (filp->f_pos == dotdot_offset) { + error = filldir(dirent, "..", 2, + dotdot_offset, + parent_ino, DT_DIR); if (error) break; stored++; - filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; + filp->f_pos = dotdot_size; continue; } - de = (struct ext4_dir_entry_2 *)(dir_buf + offset); + de = (struct ext4_dir_entry_2 *) + (dir_buf + filp->f_pos - extra_offset); if (ext4_check_dir_entry(inode, filp, de, iloc.bh, dir_buf, - inline_size, offset)) { + extra_size, filp->f_pos)) { ret = stored; goto out; } - offset += ext4_rec_len_from_disk(de->rec_len, - inline_size); if (le32_to_cpu(de->inode)) { /* We might block in the next section * if the data destination is @@ -1415,9 +1556,8 @@ revalidate: stored++; } filp->f_pos += ext4_rec_len_from_disk(de->rec_len, - inline_size); + extra_size); } - offset = 0; } out: kfree(dir_buf); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9ea0cde3fa9e..0723774bdfb5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <linux/aio.h> #include "ext4_jbd2.h" #include "xattr.h" @@ 
-55,21 +56,21 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, __u16 csum_hi = 0; __u32 csum; - csum_lo = raw->i_checksum_lo; + csum_lo = le16_to_cpu(raw->i_checksum_lo); raw->i_checksum_lo = 0; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum_hi = raw->i_checksum_hi; + csum_hi = le16_to_cpu(raw->i_checksum_hi); raw->i_checksum_hi = 0; } csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, EXT4_INODE_SIZE(inode->i_sb)); - raw->i_checksum_lo = csum_lo; + raw->i_checksum_lo = cpu_to_le16(csum_lo); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) - raw->i_checksum_hi = csum_hi; + raw->i_checksum_hi = cpu_to_le16(csum_hi); return csum; } @@ -185,8 +186,6 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); - ext4_ioend_wait(inode); - if (inode->i_nlink) { /* * When journalling data dirty buffers are tracked only in the @@ -207,15 +206,16 @@ void ext4_evict_inode(struct inode *inode) * don't use page cache. 
*/ if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT4_JOURNAL_INO) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; - jbd2_log_start_commit(journal, commit_tid); - jbd2_log_wait_commit(journal, commit_tid); + jbd2_complete_transaction(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); goto no_delete; } @@ -225,6 +225,7 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); if (is_bad_inode(inode)) goto no_delete; @@ -482,6 +483,58 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, return num; } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_map_blocks_es_recheck(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *es_map, + struct ext4_map_blocks *map, + int flags) +{ + int retval; + + map->m_flags = 0; + /* + * There is a race window that the result is not the same. + * e.g. xfstests #223 when dioread_nolock enables. The reason + * is that we lookup a block mapping in extent status tree with + * out taking i_data_sem. So at the time the unwritten extent + * could be converted. + */ + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + up_read((&EXT4_I(inode)->i_data_sem)); + /* + * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag + * because it shouldn't be marked in es_map->m_flags. 
+ */ + map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); + + /* + * We don't check m_len because extent will be collapsed in status + * tree. So the m_len might not be equal. + */ + if (es_map->m_lblk != map->m_lblk || + es_map->m_flags != map->m_flags || + es_map->m_pblk != map->m_pblk) { + printk("ES cache assertation failed for inode: %lu " + "es_cached ex [%d/%d/%llu/%x] != " + "found ex [%d/%d/%llu/%x] retval %d flags %x\n", + inode->i_ino, es_map->m_lblk, es_map->m_len, + es_map->m_pblk, es_map->m_flags, map->m_lblk, + map->m_len, map->m_pblk, map->m_flags, + retval, flags); + } +} +#endif /* ES_AGGRESSIVE_TEST */ + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -509,6 +562,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, { struct extent_status es; int retval; +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif map->m_flags = 0; ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," @@ -531,6 +589,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, } else { BUG_ON(1); } +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(handle, inode, map, + &orig_map, flags); +#endif goto found; } @@ -551,6 +613,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -643,6 +714,24 @@ found: int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (allocation)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO) && + ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es)) + goto has_zeroout; + } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -655,6 +744,7 @@ found: retval = ret; } +has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret = check_block_validity(inode, map); @@ -991,20 +1081,42 @@ retry_journal: /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { + int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); - return ext4_handle_dirty_metadata(handle, NULL, bh); + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + clear_buffer_meta(bh); + clear_buffer_prio(bh); + return ret; } -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext4 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. 
+ */ +static int ext4_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { - int i_size_changed = 0; - struct inode *inode = mapping->host; handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + int i_size_changed = 0; + + trace_ext4_write_end(inode, pos, len, copied); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) { + unlock_page(page); + page_cache_release(page); + goto errout; + } + } if (ext4_has_inline_data(inode)) copied = ext4_write_inline_data_end(inode, pos, len, @@ -1015,7 +1127,7 @@ static int ext4_generic_write_end(struct file *file, /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. + * cannot change under us because we hole i_mutex. * * But it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1025,10 +1137,10 @@ static int ext4_generic_write_end(struct file *file, i_size_changed = 1; } - if (pos + copied > EXT4_I(inode)->i_disksize) { + if (pos + copied > EXT4_I(inode)->i_disksize) { /* We need to mark inode dirty even if * new_i_size is less that inode->i_size - * bu greater than i_disksize.(hint delalloc) + * but greater than i_disksize. (hint delalloc) */ ext4_update_i_disksize(inode, (pos + copied)); i_size_changed = 1; @@ -1045,87 +1157,15 @@ static int ext4_generic_write_end(struct file *file, if (i_size_changed) ext4_mark_inode_dirty(handle, inode); - return copied; -} - -/* - * We need to pick up the new inode size which generic_commit_write gave us - * `file' can be NULL - eg, when called from page_symlink(). - * - * ext4 never places buffers on inode->i_mapping->private_list. metadata - * buffers are managed internally. 
- */ -static int ext4_ordered_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; - int ret = 0, ret2; - - trace_ext4_ordered_write_end(inode, pos, len, copied); - ret = ext4_jbd2_file_inode(handle, inode); - - if (ret == 0) { - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; - if (pos + len > inode->i_size && ext4_can_truncate(inode)) - /* if we have allocated more blocks and copied - * less. We will have blocks allocated outside - * inode->i_size. So truncate them - */ - ext4_orphan_add(handle, inode); - if (ret2 < 0) - ret = ret2; - } else { - unlock_page(page); - page_cache_release(page); - } - - ret2 = ext4_journal_stop(handle); - if (!ret) - ret = ret2; - - if (pos + len > inode->i_size) { - ext4_truncate_failed_write(inode); - /* - * If truncate failed early the inode might still be - * on the orphan list; we need to make sure the inode - * is removed from the orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - - - return ret ? ret : copied; -} - -static int ext4_writeback_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; - int ret = 0, ret2; - - trace_ext4_writeback_write_end(inode, pos, len, copied); - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; + if (copied < 0) + ret = copied; if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. 
So truncate them */ ext4_orphan_add(handle, inode); - - if (ret2 < 0) - ret = ret2; - +errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -1216,6 +1256,55 @@ static int ext4_journalled_write_end(struct file *file, } /* + * Reserve a metadata for a single block located at lblock + */ +static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) +{ + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int md_needed; + ext4_lblk_t save_last_lblock; + int save_len; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ +repeat: + spin_lock(&ei->i_block_reservation_lock); + /* + * ext4_calc_metadata_amount() has side effects, which we have + * to be prepared undo if we fail to claim space. + */ + save_len = ei->i_da_metadata_calc_len; + save_last_lblock = ei->i_da_metadata_calc_last_lblock; + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); + trace_ext4_da_reserve_space(inode, md_needed); + + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. 
+ */ + if (ext4_claim_free_clusters(sbi, md_needed, 0)) { + ei->i_da_metadata_calc_len = save_len; + ei->i_da_metadata_calc_last_lblock = save_last_lblock; + spin_unlock(&ei->i_block_reservation_lock); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + cond_resched(); + goto repeat; + } + return -ENOSPC; + } + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + +/* * Reserve a single cluster located at lblock */ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) @@ -1263,7 +1352,7 @@ repeat: ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - yield(); + cond_resched(); goto repeat; } dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); @@ -1399,7 +1488,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, struct ext4_io_submit io_submit; BUG_ON(mpd->next_page <= mpd->first_page); - memset(&io_submit, 0, sizeof(io_submit)); + ext4_io_submit_init(&io_submit, mpd->wbc); + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_submit.io_end) + return -ENOMEM; /* * We need to start from the first_page to the next_page - 1 * to make sure we also write the mapped dirty buffer_heads. 
@@ -1487,6 +1579,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, pagevec_release(&pvec); } ext4_io_submit(&io_submit); + /* Drop io_end reference we got from init */ + ext4_put_io_end_defer(io_submit.io_end); return ret; } @@ -1531,22 +1625,25 @@ static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), - ext4_count_free_clusters(inode->i_sb))); + ext4_count_free_clusters(sb))); ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", - EXT4_I(inode)->i_reserved_data_blocks); + ei->i_reserved_data_blocks); ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", - EXT4_I(inode)->i_reserved_meta_blocks); + ei->i_reserved_meta_blocks); + ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u", + ei->i_allocated_meta_blocks); return; } @@ -1601,12 +1698,21 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) */ map.m_lblk = next; map.m_len = max_blocks; - get_blocks_flags = EXT4_GET_BLOCKS_CREATE; + /* + * We're in delalloc path and it is possible that we're going to + * need more metadata blocks than previously reserved. However + * we must not fail because we're in writeback and there is + * nothing we can do about it so it might result in data loss. + * So use reserved blocks to allocate metadata if possible. 
+ */ + get_blocks_flags = EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL; if (ext4_should_dioread_nolock(mpd->inode)) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (mpd->b_state & (1 << BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); if (blks < 0) { struct super_block *sb = mpd->inode->i_sb; @@ -1768,6 +1874,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, struct extent_status es; int retval; sector_t invalid_block = ~((sector_t) 0xffff); +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; @@ -1809,6 +1920,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, else BUG_ON(1); +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); +#endif return retval; } @@ -1843,8 +1957,11 @@ add_delayed: * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* If the block was allocated from previously allocated cluster, - * then we dont need to reserve it again. */ + /* + * If the block was allocated from previously allocated cluster, + * then we don't need to reserve it again. However we still need + * to reserve metadata for every block we're going to write. 
+ */ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { ret = ext4_da_reserve_space(inode, iblock); if (ret) { @@ -1852,6 +1969,13 @@ add_delayed: retval = ret; goto out_unlock; } + } else { + ret = ext4_da_reserve_metadata(inode, iblock); + if (ret) { + /* not enough space to reserve */ + retval = ret; + goto out_unlock; + } } ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, @@ -1873,6 +1997,15 @@ add_delayed: int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, @@ -2106,9 +2239,16 @@ static int ext4_writepage(struct page *page, */ return __ext4_journalled_writepage(page, len); - memset(&io_submit, 0, sizeof(io_submit)); + ext4_io_submit_init(&io_submit, wbc); + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_submit.io_end) { + redirty_page_for_writepage(wbc, page); + return -ENOMEM; + } ret = ext4_bio_write_page(&io_submit, page, len, wbc); ext4_io_submit(&io_submit); + /* Drop io_end reference we got from init */ + ext4_put_io_end_defer(io_submit.io_end); return ret; } @@ -2495,7 +2635,7 @@ out_writepages: static int ext4_nonda_switch(struct super_block *sb) { - s64 free_blocks, dirty_blocks; + s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* @@ -2506,17 +2646,18 @@ static int ext4_nonda_switch(struct super_block *sb) * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. 
*/ - free_blocks = EXT4_C2B(sbi, - percpu_counter_read_positive(&sbi->s_freeclusters_counter)); - dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); + free_clusters = + percpu_counter_read_positive(&sbi->s_freeclusters_counter); + dirty_clusters = + percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); /* * Start pushing delalloc when 1/2 of free blocks are dirty. */ - if (dirty_blocks && (free_blocks < 2 * dirty_blocks)) + if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); - if (2 * free_blocks < 3 * dirty_blocks || - free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { + if (2 * free_clusters < 3 * dirty_clusters || + free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark @@ -2652,18 +2793,9 @@ static int ext4_da_write_end(struct file *file, unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; - if (write_mode == FALL_BACK_TO_NONDELALLOC) { - switch (ext4_inode_journal_mode(inode)) { - case EXT4_INODE_ORDERED_DATA_MODE: - return ext4_ordered_write_end(file, mapping, pos, - len, copied, page, fsdata); - case EXT4_INODE_WRITEBACK_DATA_MODE: - return ext4_writeback_write_end(file, mapping, pos, - len, copied, page, fsdata); - default: - BUG(); - } - } + if (write_mode == FALL_BACK_TO_NONDELALLOC) + return ext4_write_end(file, mapping, pos, + len, copied, page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); @@ -2908,8 +3040,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) trace_ext4_releasepage(page); - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) + /* Page has dirty journalled data -> cannot release */ + if (PageChecked(page)) return 0; if (journal) return jbd2_journal_try_to_free_buffers(journal, page, wait); @@ -2947,9 +3079,13 @@ static void 
ext4_end_io_dio(struct kiocb *iocb, loff_t offset, struct inode *inode = file_inode(iocb->ki_filp); ext4_io_end_t *io_end = iocb->private; - /* if not async direct IO or dio with 0 bytes write, just return */ - if (!io_end || !size) - goto out; + /* if not async direct IO just return */ + if (!io_end) { + inode_dio_done(inode); + if (is_async) + aio_complete(iocb, ret, 0); + return; + } ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", @@ -2957,25 +3093,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); iocb->private = NULL; - - /* if not aio dio with unwritten extents, just free io and return */ - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); -out: - inode_dio_done(inode); - if (is_async) - aio_complete(iocb, ret, 0); - return; - } - io_end->offset = offset; io_end->size = size; if (is_async) { io_end->iocb = iocb; io_end->result = ret; } - - ext4_add_complete_io(io_end); + ext4_put_io_end_defer(io_end); } /* @@ -3009,6 +3133,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_t *get_block_func = NULL; int dio_flags = 0; loff_t final_size = offset + count; + ext4_io_end_t *io_end = NULL; /* Use the old path for reads and writes beyond i_size. */ if (rw != WRITE || final_size > inode->i_size) @@ -3047,13 +3172,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, iocb->private = NULL; ext4_inode_aio_set(inode, NULL); if (!is_sync_kiocb(iocb)) { - ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); + io_end = ext4_init_io_end(inode, GFP_NOFS); if (!io_end) { ret = -ENOMEM; goto retake_lock; } io_end->flag |= EXT4_IO_END_DIRECT; - iocb->private = io_end; + /* + * Grab reference for DIO. 
Will be dropped in ext4_end_io_dio() + */ + iocb->private = ext4_get_io_end(io_end); /* * we save the io structure for current async direct * IO, so that later ext4_map_blocks() could flag the @@ -3077,26 +3205,27 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, NULL, dio_flags); - if (iocb->private) - ext4_inode_aio_set(inode, NULL); /* - * The io_end structure takes a reference to the inode, that - * structure needs to be destroyed and the reference to the - * inode need to be dropped, when IO is complete, even with 0 - * byte write, or failed. - * - * In the successful AIO DIO case, the io_end structure will - * be destroyed and the reference to the inode will be dropped - * after the end_io call back function is called. - * - * In the case there is 0 byte write, or error case, since VFS - * direct IO won't invoke the end_io call back function, we - * need to free the end_io structure here. + * Put our reference to io_end. This can free the io_end structure e.g. + * in sync IO case or in case of error. It can even perform extent + * conversion if all bios we submitted finished before we got here. + * Note that in that case iocb->private can be already set to NULL + * here. */ - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { - ext4_free_io_end(iocb->private); - iocb->private = NULL; - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + if (io_end) { + ext4_inode_aio_set(inode, NULL); + ext4_put_io_end(io_end); + /* + * In case of error or no write ext4_end_io_dio() was not + * called so we have to put iocb's reference. 
+ */ + if (ret <= 0 && ret != -EIOCBQUEUED) { + WARN_ON(iocb->private != io_end); + ext4_put_io_end(io_end); + iocb->private = NULL; + } + } + if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { int err; /* @@ -3168,27 +3297,12 @@ static int ext4_journalled_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } -static const struct address_space_operations ext4_ordered_aops = { +static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, .write_begin = ext4_write_begin, - .write_end = ext4_ordered_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations ext4_writeback_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_writepage, - .write_begin = ext4_write_begin, - .write_end = ext4_writeback_write_end, + .write_end = ext4_write_end, .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, @@ -3233,23 +3347,21 @@ void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: - if (test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else - inode->i_mapping->a_ops = &ext4_ordered_aops; + ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); break; case EXT4_INODE_WRITEBACK_DATA_MODE: - if (test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else - inode->i_mapping->a_ops = &ext4_writeback_aops; + ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; - break; + return; default: BUG(); } + 
if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_aops; } @@ -3480,20 +3592,190 @@ int ext4_can_truncate(struct inode *inode) int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + ext4_lblk_t first_block, stop_block; + struct address_space *mapping = inode->i_mapping; + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; + handle_t *handle; + unsigned int credits; + int ret = 0; + if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - return ext4_ind_punch_hole(file, offset, length); - - if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { + if (EXT4_SB(sb)->s_cluster_ratio > 1) { /* TODO: Add support for bigalloc file systems */ return -EOPNOTSUPP; } trace_ext4_punch_hole(inode, offset, length); - return ext4_ext_punch_hole(file, offset, length); + /* + * Write out all dirty pages to avoid race conditions + * Then release them. 
+ */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + length - 1); + if (ret) + return ret; + } + + mutex_lock(&inode->i_mutex); + /* It's not possible punch hole on append only file */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + ret = -EPERM; + goto out_mutex; + } + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + goto out_mutex; + } + + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + goto out_mutex; + + /* + * If the hole extends beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; + } + + first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + last_page = (offset + length) >> PAGE_CACHE_SHIFT; + + first_page_offset = first_page << PAGE_CACHE_SHIFT; + last_page_offset = last_page << PAGE_CACHE_SHIFT; + + /* Now release the pages */ + if (last_page_offset > first_page_offset) { + truncate_pagecache_range(inode, first_page_offset, + last_page_offset - 1); + } + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + ret = ext4_flush_unwritten_io(inode); + if (ret) + goto out_dio; + inode_dio_wait(inode); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); + else + credits = ext4_blocks_for_truncate(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(sb, ret); + goto out_dio; + } + + /* + * Now we need to zero out the non-page-aligned data in the + * pages at the start and tail of the hole, and unmap the + * buffer heads for the block aligned regions of the page that + * were completely zeroed. 
+ */ + if (first_page > last_page) { + /* + * If the file space being truncated is contained + * within a page just zero out and unmap the middle of + * that page + */ + ret = ext4_discard_partial_page_buffers(handle, + mapping, offset, length, 0); + + if (ret) + goto out_stop; + } else { + /* + * zero out and unmap the partial page that contains + * the start of the hole + */ + page_len = first_page_offset - offset; + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers(handle, mapping, + offset, page_len, 0); + if (ret) + goto out_stop; + } + + /* + * zero out and unmap the partial page that contains + * the end of the hole + */ + page_len = offset + length - last_page_offset; + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers(handle, mapping, + last_page_offset, page_len, 0); + if (ret) + goto out_stop; + } + } + + /* + * If i_size is contained in the last page, we need to + * unmap and zero the partial page after i_size + */ + if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && + inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + if (page_len > 0) { + ret = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (ret) + goto out_stop; + } + } + + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* If there are no blocks to remove, return now */ + if (first_block >= stop_block) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_remove_space(inode, first_block, + stop_block - 1); + else + ret = ext4_free_hole_blocks(handle, inode, first_block, + stop_block); + + 
ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; } /* @@ -3526,6 +3808,19 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) */ void ext4_truncate(struct inode *inode) { + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int credits; + handle_t *handle; + struct address_space *mapping = inode->i_mapping; + loff_t page_len; + + /* + * There is a possibility that we're either freeing the inode + * or it completely new indode. In those cases we might not + * have i_mutex locked because it's not necessary. + */ + if (!(inode->i_state & (I_NEW|I_FREEING))) + WARN_ON(!mutex_is_locked(&inode->i_mutex)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) @@ -3544,10 +3839,72 @@ void ext4_truncate(struct inode *inode) return; } + /* + * finish any pending end_io work so we won't run the risk of + * converting any truncated blocks to initialized later + */ + ext4_flush_unwritten_io(inode); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); + else + credits = ext4_blocks_for_truncate(inode); + + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + return; + } + + if (inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + if (ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0)) + goto out_stop; + } + + /* + * We add the inode to the orphan list, so that if this + * truncate spans multiple transactions, and we crash, we will + * resume the truncate when the filesystem recovers. 
It also + * marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + + ext4_discard_preallocations(inode); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ext4_ext_truncate(inode); + ext4_ext_truncate(handle, inode); else - ext4_ind_truncate(inode); + ext4_ind_truncate(handle, inode); + + up_write(&ei->i_data_sem); + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_stop: + /* + * If this was a simple ftruncate() and the file will remain alive, + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); } @@ -3655,13 +4012,14 @@ make_io: if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; + __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ - b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); + b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; - end = b + EXT4_SB(sb)->s_inode_readahead_blks; + end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); @@ -3858,8 +4216,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { - if (inode->i_mode == 0 || - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { + if ((inode->i_mode == 0 || + !(EXT4_SB(inode->i_sb)->s_mount_state 
& EXT4_ORPHAN_FS)) && + ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted */ ret = -ESTALE; goto bad_inode; @@ -3867,7 +4226,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete - * the process of deleting those. */ + * the process of deleting those. + * OR it is the EXT4_BOOT_LOADER_INO which is + * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); @@ -3987,6 +4348,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } else if (ino == EXT4_BOOT_LOADER_INO) { + make_bad_inode(inode); } else { ret = -EIO; EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 721f4d33e148..9491ac0590f7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -17,9 +17,201 @@ #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" +#include "ext4_extents.h" #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) +/** + * Swap memory between @a and @b for @len bytes. + * + * @a: pointer to first memory area + * @b: pointer to second memory area + * @len: number of bytes to swap + * + */ +static void memswap(void *a, void *b, size_t len) +{ + unsigned char *ap, *bp; + unsigned char tmp; + + ap = (unsigned char *)a; + bp = (unsigned char *)b; + while (len-- > 0) { + tmp = *ap; + *ap = *bp; + *bp = tmp; + ap++; + bp++; + } +} + +/** + * Swap i_data and associated attributes between @inode1 and @inode2. + * This function is used for the primary swap between inode1 and inode2 + * and also to revert this primary swap in case of errors. + * + * Therefore you have to make sure, that calling this method twice + * will revert all changes. 
+ * + * @inode1: pointer to first inode + * @inode2: pointer to second inode + */ +static void swap_inode_data(struct inode *inode1, struct inode *inode2) +{ + loff_t isize; + struct ext4_inode_info *ei1; + struct ext4_inode_info *ei2; + + ei1 = EXT4_I(inode1); + ei2 = EXT4_I(inode2); + + memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags)); + memswap(&inode1->i_version, &inode2->i_version, + sizeof(inode1->i_version)); + memswap(&inode1->i_blocks, &inode2->i_blocks, + sizeof(inode1->i_blocks)); + memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes)); + memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime)); + memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime)); + + memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); + memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); + memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); + memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree)); + memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr)); + + isize = i_size_read(inode1); + i_size_write(inode1, i_size_read(inode2)); + i_size_write(inode2, isize); +} + +/** + * Swap the information from the given @inode and the inode + * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other + * important fields of the inodes. 
+ * + * @sb: the super block of the filesystem + * @inode: the inode to swap with EXT4_BOOT_LOADER_INO + * + */ +static long swap_inode_boot_loader(struct super_block *sb, + struct inode *inode) +{ + handle_t *handle; + int err; + struct inode *inode_bl; + struct ext4_inode_info *ei; + struct ext4_inode_info *ei_bl; + struct ext4_sb_info *sbi; + + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { + err = -EINVAL; + goto swap_boot_out; + } + + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto swap_boot_out; + } + + sbi = EXT4_SB(sb); + ei = EXT4_I(inode); + + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); + if (IS_ERR(inode_bl)) { + err = PTR_ERR(inode_bl); + goto swap_boot_out; + } + ei_bl = EXT4_I(inode_bl); + + filemap_flush(inode->i_mapping); + filemap_flush(inode_bl->i_mapping); + + /* Protect orig inodes against a truncate and make sure, + * that only 1 swap_inode_boot_loader is running. */ + ext4_inode_double_lock(inode, inode_bl); + + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(&inode_bl->i_data, 0); + + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(inode); + ext4_inode_block_unlocked_dio(inode_bl); + inode_dio_wait(inode); + inode_dio_wait(inode_bl); + + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); + if (IS_ERR(handle)) { + err = -EINVAL; + goto swap_boot_out; + } + + /* Protect extent tree against block allocations via delalloc */ + ext4_double_down_write_data_sem(inode, inode_bl); + + if (inode_bl->i_nlink == 0) { + /* this inode has never been used as a BOOT_LOADER */ + set_nlink(inode_bl, 1); + i_uid_write(inode_bl, 0); + i_gid_write(inode_bl, 0); + inode_bl->i_flags = 0; + ei_bl->i_flags = 0; + inode_bl->i_version = 1; + i_size_write(inode_bl, 0); + inode_bl->i_mode = S_IFREG; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode_bl); + } 
else + memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); + } + + swap_inode_data(inode, inode_bl); + + inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); + + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + inode_bl->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ext4_discard_preallocations(inode); + + err = ext4_mark_inode_dirty(handle, inode); + if (err < 0) { + ext4_warning(inode->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + } else { + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + } + } + + ext4_journal_stop(handle); + + ext4_double_up_write_data_sem(inode, inode_bl); + + ext4_inode_resume_unlocked_dio(inode); + ext4_inode_resume_unlocked_dio(inode_bl); + + ext4_inode_double_unlock(inode, inode_bl); + + iput(inode_bl); + +swap_boot_out: + return err; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } - if (oldflags & EXT4_EXTENTS_FL) { - /* We don't support clearning extent flags */ - if (!(flags & EXT4_EXTENTS_FL)) { - err = -EOPNOTSUPP; - goto flags_out; - } - } else if (flags & EXT4_EXTENTS_FL) { - /* migrate the file */ + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; - flags &= ~EXT4_EXTENTS_FL; - } if (flags & EXT4_EOFBLOCKS_FL) { /* we don't support adding EOFBLOCKS flag */ @@ -137,8 +320,13 @@ flags_err: err = ext4_change_inode_journal_flag(inode, jflag); if (err) goto flags_out; - if (migrate) - err = ext4_ext_migrate(inode); + if (migrate) { + 
if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + flags_out: mutex_unlock(&inode->i_mutex); mnt_drop_write_file(filp); @@ -357,9 +545,13 @@ group_add_out: return err; } + case EXT4_IOC_SWAP_BOOT: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + return swap_inode_boot_loader(sb, inode); + case EXT4_IOC_RESIZE_FS: { ext4_fsblk_t n_blocks_count; - struct super_block *sb = inode->i_sb; int err = 0, err2 = 0; ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7bb713a46fe4..b1ed9e07434b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -405,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr) ext4_clear_bit(bit, addr); } +static inline int mb_test_and_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_and_clear_bit(bit, addr); +} + static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; @@ -764,6 +770,24 @@ void ext4_mb_generate_buddy(struct super_block *sb, spin_unlock(&EXT4_SB(sb)->s_bal_lock); } +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) { + ext4_set_bits(buddy, 0, count); + } + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group); +} + /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. 
The information involve @@ -860,8 +884,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { - int group; - group = (first_block + i) >> 1; if (group >= ngroups) break; @@ -1011,6 +1033,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) struct page *page; int ret = 0; + might_sleep(); mb_debug(1, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); /* @@ -1082,6 +1105,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; + might_sleep(); mb_debug(1, "load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; @@ -1244,6 +1268,33 @@ static void mb_clear_bits(void *bm, int cur, int len) } } +/* clear bits in given range + * will return first found zero bit if any, -1 otherwise + */ +static int mb_test_and_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + int zero_bit = -1; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + if (*addr != (__u32)(-1) && zero_bit == -1) + zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); + *addr = 0; + cur += 32; + continue; + } + if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) + zero_bit = cur; + cur++; + } + + return zero_bit; +} + void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1262,17 +1313,90 @@ void ext4_set_bits(void *bm, int cur, int len) } } +/* + * _________________________________________________________________ */ + +static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) +{ + if (mb_test_bit(*bit + side, bitmap)) { + mb_clear_bit(*bit, bitmap); + (*bit) -= side; + return 1; + } + else { + (*bit) += side; + mb_set_bit(*bit, bitmap); + return -1; + } +} + +static void mb_buddy_mark_free(struct ext4_buddy *e4b, int 
first, int last) +{ + int max; + int order = 1; + void *buddy = mb_find_buddy(e4b, order, &max); + + while (buddy) { + void *buddy2; + + /* Bits in range [first; last] are known to be set since + * corresponding blocks were allocated. Bits in range + * (first; last) will stay set because they form buddies on + * upper layer. We just deal with borders if they don't + * align with upper layer and then go up. + * Releasing entire group is all about clearing + * single bit of highest order buddy. + */ + + /* Example: + * --------------------------------- + * | 1 | 1 | 1 | 1 | + * --------------------------------- + * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------- + * 0 1 2 3 4 5 6 7 + * \_____________________/ + * + * Neither [1] nor [6] is aligned to above layer. + * Left neighbour [0] is free, so mark it busy, + * decrease bb_counters and extend range to + * [0; 6] + * Right neighbour [7] is busy. It can't be coaleasced with [6], so + * mark [6] free, increase bb_counters and shrink range to + * [0; 5]. + * Then shift range to [0; 2], go up and do the same. 
+ */ + + + if (first & 1) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); + if (!(last & 1)) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); + if (first > last) + break; + order++; + + if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { + mb_clear_bits(buddy, first, last - first + 1); + e4b->bd_info->bb_counters[order - 1] += last - first + 1; + break; + } + first >>= 1; + last >>= 1; + buddy = buddy2; + } +} + static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, - int first, int count) + int first, int count) { - int block = 0; - int max = 0; - int order; - void *buddy; - void *buddy2; + int left_is_free = 0; + int right_is_free = 0; + int block; + int last = first + count - 1; struct super_block *sb = e4b->bd_sb; - BUG_ON(first + count > (sb->s_blocksize << 3)); + BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1281,67 +1405,54 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; - /* let's maintain fragments counter */ + /* access memory sequentially: check left neighbour, + * clear range and then check right neighbour + */ if (first != 0) - block = !mb_test_bit(first - 1, e4b->bd_bitmap); - if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) - max = !mb_test_bit(first + count, e4b->bd_bitmap); - if (block && max) - e4b->bd_info->bb_fragments--; - else if (!block && !max) - e4b->bd_info->bb_fragments++; - - /* let's maintain buddy itself */ - while (count-- > 0) { - block = first++; - order = 0; + left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); + block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); + if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) + right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); - if (!mb_test_bit(block, 
e4b->bd_bitmap)) { - ext4_fsblk_t blocknr; + if (unlikely(block != -1)) { + ext4_fsblk_t blocknr; - blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += EXT4_C2B(EXT4_SB(sb), block); - ext4_grp_locked_error(sb, e4b->bd_group, - inode ? inode->i_ino : 0, - blocknr, - "freeing already freed block " - "(bit %u)", block); - } - mb_clear_bit(block, e4b->bd_bitmap); - e4b->bd_info->bb_counters[order]++; - - /* start of the buddy */ - buddy = mb_find_buddy(e4b, order, &max); - - do { - block &= ~1UL; - if (mb_test_bit(block, buddy) || - mb_test_bit(block + 1, buddy)) - break; - - /* both the buddies are free, try to coalesce them */ - buddy2 = mb_find_buddy(e4b, order + 1, &max); + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), block); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u)", block); + mb_regenerate_buddy(e4b); + goto done; + } - if (!buddy2) - break; + /* let's maintain fragments counter */ + if (left_is_free && right_is_free) + e4b->bd_info->bb_fragments--; + else if (!left_is_free && !right_is_free) + e4b->bd_info->bb_fragments++; - if (order > 0) { - /* for special purposes, we don't set - * free bits in bitmap */ - mb_set_bit(block, buddy); - mb_set_bit(block + 1, buddy); - } - e4b->bd_info->bb_counters[order]--; - e4b->bd_info->bb_counters[order]--; + /* buddy[0] == bd_bitmap is a special case, so handle + * it right away and let mb_buddy_mark_free stay free of + * zero order checks. + * Check if neighbours are to be coaleasced, + * adjust bitmap bb_counters and borders appropriately. + */ + if (first & 1) { + first += !left_is_free; + e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; + } + if (!(last & 1)) { + last -= !right_is_free; + e4b->bd_info->bb_counters[0] += right_is_free ? 
-1 : 1; + } - block = block >> 1; - order++; - e4b->bd_info->bb_counters[order]++; + if (first <= last) + mb_buddy_mark_free(e4b, first >> 1, last >> 1); - mb_clear_bit(block, buddy2); - buddy = buddy2; - } while (1); - } +done: mb_set_largest_free_order(sb, e4b->bd_info); mb_check_buddy(e4b); } @@ -2149,7 +2260,7 @@ static const struct seq_operations ext4_mb_seq_groups_ops = { static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) { - struct super_block *sb = PDE(inode)->data; + struct super_block *sb = PDE_DATA(inode); int rc; rc = seq_open(file, &ext4_mb_seq_groups_ops); @@ -2804,8 +2915,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); - atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -3342,7 +3453,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, if (pa->pa_type == MB_GROUP_PA) grp_blk--; - ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); + grp = ext4_get_group_number(sb, grp_blk); /* * possible race: @@ -3692,11 +3803,7 @@ repeat: if (free < needed && busy) { busy = 0; ext4_unlock_group(sb, group); - /* - * Yield the CPU here so that we don't get soft lockup - * in non preempt case. 
- */ - yield(); + cond_resched(); goto repeat; } @@ -3811,7 +3918,7 @@ repeat: list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { @@ -4073,7 +4180,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + group = ext4_get_group_number(sb, pa->pa_pstart); if (ext4_mb_load_buddy(sb, group, &e4b)) { ext4_error(sb, "Error loading buddy information for %u", group); @@ -4221,6 +4328,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, unsigned int inquota = 0; unsigned int reserv_clstrs = 0; + might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); @@ -4246,7 +4354,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ - yield(); + cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { @@ -4424,11 +4532,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(entry, new_entry)) { + if (can_merge(entry, new_entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { new_entry->efd_start_cluster = entry->efd_start_cluster; new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - ext4_journal_callback_del(handle, &entry->efd_jce); kmem_cache_free(ext4_free_data_cachep, entry); } } @@ -4436,10 +4544,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(new_entry, entry)) { + if (can_merge(new_entry, entry) && + ext4_journal_callback_try_del(handle, 
&entry->efd_jce)) { new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - ext4_journal_callback_del(handle, &entry->efd_jce); kmem_cache_free(ext4_free_data_cachep, entry); } } @@ -4464,7 +4572,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_group_desc *gdp; - unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; @@ -4475,6 +4582,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, int err = 0; int ret; + might_sleep(); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); @@ -4666,14 +4774,12 @@ do_more: if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count_clusters, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); - freed += count; - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); @@ -4811,8 +4917,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(EXT4_NUM_B2C(sbi, blocks_freed), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 480acf4a085f..49e8bdff9163 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode) return retval; } return retval; - } int ext4_ext_migrate(struct inode *inode) @@ -606,3 +605,64 @@ out: return retval; } + +/* + * Migrate a simple extent-based inode to use the i_blocks[] array + */ +int ext4_ind_migrate(struct inode *inode) +{ + struct 
ext4_extent_header *eh; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent *ex; + unsigned int i, len; + ext4_fsblk_t blk; + handle_t *handle; + int ret; + + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + return -EOPNOTSUPP; + + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_check_inode(inode); + if (ret) + goto errout; + + eh = ext_inode_hdr(inode); + ex = EXT_FIRST_EXTENT(eh); + if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || + eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { + ret = -EOPNOTSUPP; + goto errout; + } + if (eh->eh_entries == 0) + blk = len = 0; + else { + len = le16_to_cpu(ex->ee_len); + blk = ext4_ext_pblock(ex); + if (len > EXT4_NDIR_BLOCKS) { + ret = -EOPNOTSUPP; + goto errout; + } + } + + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + memset(ei->i_data, 0, sizeof(ei->i_data)); + for (i=0; i < len; i++) + ei->i_data[i] = cpu_to_le32(blk++); + ext4_mark_inode_dirty(handle, inode); +errout: + ext4_journal_stop(handle); + up_write(&EXT4_I(inode)->i_data_sem); + return ret; +} diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index f9b551561d2c..214461e42a05 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -7,7 +7,7 @@ #include "ext4.h" /* Checksumming functions */ -static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) +static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct mmp_struct, mmp_checksum); @@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; 
get_bh(bh); - submit_bh(WRITE_SYNC, bh); + submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) @@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, get_bh(*bh); lock_buffer(*bh); (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(READ_SYNC, *bh); + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { brelse(*bh); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 4e81d47aa8cb..3dcbf364022f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -32,16 +32,18 @@ */ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **path) + struct ext4_ext_path **orig_path) { int ret = 0; + struct ext4_ext_path *path; - *path = ext4_ext_find_extent(inode, lblock, *path); - if (IS_ERR(*path)) { - ret = PTR_ERR(*path); - *path = NULL; - } else if ((*path)[ext_depth(inode)].p_ext == NULL) + path = ext4_ext_find_extent(inode, lblock, *orig_path); + if (IS_ERR(path)) + ret = PTR_ERR(path); + else if (path[ext_depth(inode)].p_ext == NULL) ret = -ENODATA; + else + *orig_path = path; return ret; } @@ -142,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, } /** - * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem + * ext4_double_down_write_data_sem - Acquire two inodes' write lock + * of i_data_sem * * Acquire write lock of i_data_sem of the two inodes */ -static void -double_down_write_data_sem(struct inode *first, struct inode *second) +void +ext4_double_down_write_data_sem(struct inode *first, struct inode *second) { if (first < second) { down_write(&EXT4_I(first)->i_data_sem); @@ -160,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second) } /** - * double_up_write_data_sem - Release two inodes' write lock of i_data_sem + * ext4_double_up_write_data_sem - Release two inodes' write 
lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second * Release write lock of i_data_sem of two inodes (orig and donor). */ -static void -double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +void +ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode) { up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); @@ -405,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode, mext_insert_inside_block(o_start, o_end, start_ext, new_ext, end_ext, eh, range_to_move); - if (depth) { - ret = ext4_handle_dirty_metadata(handle, orig_inode, - orig_path->p_bh); - if (ret) - return ret; - } else { - ret = ext4_mark_inode_dirty(handle, orig_inode); - if (ret < 0) - return ret; - } - - return 0; + return ext4_ext_dirty(handle, orig_inode, orig_path); } /** @@ -611,24 +604,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; + int ret = 0; ext4_lblk_t last = from + count; while (from < last) { *err = get_ext_path(inode, from, &path); if (*err) - return 0; + goto out; ext = path[ext_depth(inode)].p_ext; - if (!ext) { - ext4_ext_drop_refs(path); - return 0; - } - if (uninit != ext4_ext_is_uninitialized(ext)) { - ext4_ext_drop_refs(path); - return 0; - } + if (uninit != ext4_ext_is_uninitialized(ext)) + goto out; from += ext4_ext_get_actual_len(ext); ext4_ext_drop_refs(path); } - return 1; + ret = 1; +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return ret; } /** @@ -666,6 +660,14 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; + *err = ext4_es_remove_extent(orig_inode, from, count); + if (*err) + goto out; + + *err = ext4_es_remove_extent(donor_inode, from, count); + if (*err) + goto out; + /* Get the 
original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); if (*err) @@ -726,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, donor_off += dext_alen; orig_off += dext_alen; + BUG_ON(replaced_count > count); /* Already moved the expected blocks */ if (replaced_count >= count) break; @@ -803,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, page_cache_release(page[0]); return -ENOMEM; } - + /* + * grab_cache_page_write_begin() may not wait on page's writeback if + * BDI not demand that. But it is reasonable to be very conservative + * here and explicitly wait on page's writeback + */ + wait_on_page_writeback(page[0]); + wait_on_page_writeback(page[1]); if (inode1 > inode2) { struct page *tmp; tmp = page[0]; @@ -845,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { - int err = 0; err = ext4_get_block(inode, block, bh, 0); if (err) { SetPageError(page); @@ -965,7 +973,7 @@ again: * necessary, just swap data blocks between orig and donor. 
*/ if (uninit) { - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to * fallback to data copying */ uninit = mext_check_coverage(orig_inode, orig_blk_offset, @@ -979,7 +987,7 @@ again: goto drop_data_sem; if (!uninit) { - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } if ((page_has_private(pagep[0]) && @@ -993,7 +1001,7 @@ again: donor_inode, orig_blk_offset, block_len_in_page, err); drop_data_sem: - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); goto unlock_pages; } data_copy: @@ -1022,7 +1030,7 @@ data_copy: } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - *err = __block_write_begin(pagep[0], from, from + replaced_size, + *err = __block_write_begin(pagep[0], from, replaced_size, ext4_get_block); if (!*err) *err = block_commit_write(pagep[0], from, from + replaced_size); @@ -1054,11 +1062,11 @@ repair_branches: * Extents are swapped already, but we are not able to copy data. 
* Try to swap extents to it's original places */ - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, orig_blk_offset, block_len_in_page, &err2); - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); if (replaced_count != block_len_in_page) { EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), "Unable to copy data block," @@ -1198,15 +1206,15 @@ mext_check_arguments(struct inode *orig_inode, } /** - * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 + * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure * * Lock two inodes' i_mutex */ -static void -mext_inode_double_lock(struct inode *inode1, struct inode *inode2) +void +ext4_inode_double_lock(struct inode *inode1, struct inode *inode2) { BUG_ON(inode1 == inode2); if (inode1 < inode2) { @@ -1219,15 +1227,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) } /** - * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 + * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 * * @inode1: the inode that is released first * @inode2: the inode that is released second * */ -static void -mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) +void +ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2) { mutex_unlock(&inode1->i_mutex); mutex_unlock(&inode2->i_mutex); @@ -1322,7 +1330,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, return -EINVAL; } /* Protect orig and donor inodes against a truncate */ - mext_inode_double_lock(orig_inode, donor_inode); + ext4_inode_double_lock(orig_inode, donor_inode); /* Wait for all existing dio workers */ ext4_inode_block_unlocked_dio(orig_inode); @@ -1331,7 +1339,7 @@ 
ext4_move_extents(struct file *o_filp, struct file *d_filp, inode_dio_wait(donor_inode); /* Protect extent tree against block allocations via delalloc */ - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret = mext_check_arguments(orig_inode, donor_inode, orig_start, donor_start, &len); @@ -1455,7 +1463,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, * b. racing with ->readpage, ->write_begin, and ext4_get_block * in move_extent_per_page */ - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); while (orig_page_offset <= seq_end_page) { @@ -1489,7 +1497,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, block_len_in_page = rest_blocks; } - double_down_write_data_sem(orig_inode, donor_inode); + ext4_double_down_write_data_sem(orig_inode, donor_inode); if (ret < 0) break; @@ -1527,10 +1535,10 @@ out: ext4_ext_drop_refs(holecheck_path); kfree(holecheck_path); } - double_up_write_data_sem(orig_inode, donor_inode); + ext4_double_up_write_data_sem(orig_inode, donor_inode); ext4_inode_resume_unlocked_dio(orig_inode); ext4_inode_resume_unlocked_dio(donor_inode); - mext_inode_double_unlock(orig_inode, donor_inode); + ext4_inode_double_unlock(orig_inode, donor_inode); return ret; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 3825d6aa8336..6653fc35ecb7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - __u32 csum, old_csum; + __u32 csum; + __le32 save_csum; int size; size = count_offset + (count * sizeof(struct dx_entry)); - old_csum = t->dt_checksum; + save_csum = t->dt_checksum; t->dt_checksum = 0; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 
*)dirent, size); csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); - t->dt_checksum = old_csum; + t->dt_checksum = save_csum; return cpu_to_le32(csum); } @@ -971,6 +972,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + count = htree_inlinedir_to_tree(dir_file, dir, 0, + &hinfo, start_hash, + start_minor_hash, + &has_inline_data); + if (has_inline_data) { + *next_hash = ~0; + return count; + } + } count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); *next_hash = ~0; @@ -1455,24 +1467,6 @@ struct dentry *ext4_get_parent(struct dentry *child) return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); } -#define S_SHIFT 12 -static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, - [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, - [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, -}; - -static inline void ext4_set_de_type(struct super_block *sb, - struct ext4_dir_entry_2 *de, - umode_t mode) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) - de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; -} - /* * Move count entries from end of map between two memory locations. * Returns pointer to last entry moved. 
@@ -2251,8 +2245,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, dquot_initialize(dir); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -2286,8 +2279,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, dquot_initialize(dir); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -2396,8 +2388,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) dquot_initialize(dir); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, &dentry->d_name, @@ -2826,8 +2817,7 @@ static int ext4_symlink(struct inode *dir, * quota blocks, sb is already counted in previous macros). 
*/ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; } retry: inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 809b31003ecc..19599bded62a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -18,6 +18,7 @@ #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/namei.h> +#include <linux/aio.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> @@ -29,57 +30,60 @@ #include "xattr.h" #include "acl.h" -static struct kmem_cache *io_page_cachep, *io_end_cachep; +static struct kmem_cache *io_end_cachep; int __init ext4_init_pageio(void) { - io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); - if (io_page_cachep == NULL) - return -ENOMEM; io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); - if (io_end_cachep == NULL) { - kmem_cache_destroy(io_page_cachep); + if (io_end_cachep == NULL) return -ENOMEM; - } return 0; } void ext4_exit_pageio(void) { kmem_cache_destroy(io_end_cachep); - kmem_cache_destroy(io_page_cachep); } -void ext4_ioend_wait(struct inode *inode) +/* + * This function is called by ext4_evict_inode() to make sure there is + * no more pending I/O completion work left to do. + */ +void ext4_ioend_shutdown(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); + /* + * We need to make sure the work structure is finished being + * used before we let the inode get destroyed. 
+ */ + if (work_pending(&EXT4_I(inode)->i_unwritten_work)) + cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); } -static void put_io_page(struct ext4_io_page *io_page) +static void ext4_release_io_end(ext4_io_end_t *io_end) { - if (atomic_dec_and_test(&io_page->p_count)) { - end_page_writeback(io_page->p_page); - put_page(io_page->p_page); - kmem_cache_free(io_page_cachep, io_page); - } + BUG_ON(!list_empty(&io_end->list)); + BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + + if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io_end->inode)); + if (io_end->flag & EXT4_IO_END_DIRECT) + inode_dio_done(io_end->inode); + if (io_end->iocb) + aio_complete(io_end->iocb, io_end->result, 0); + kmem_cache_free(io_end_cachep, io_end); } -void ext4_free_io_end(ext4_io_end_t *io) +static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { - int i; + struct inode *inode = io_end->inode; - BUG_ON(!io); - BUG_ON(!list_empty(&io->list)); - BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); - - for (i = 0; i < io->num_io_pages; i++) - put_io_page(io->pages[i]); - io->num_io_pages = 0; - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) - wake_up_all(ext4_ioend_wq(io->inode)); - kmem_cache_free(io_end_cachep, io); + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); } /* check a range of space and convert unwritten extents to written. 
*/ @@ -102,13 +106,8 @@ static int ext4_end_io(ext4_io_end_t *io) "(inode %lu, offset %llu, size %zd, error %d)", inode->i_ino, offset, size, ret); } - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); - if (io->flag & EXT4_IO_END_DIRECT) - inode_dio_done(inode); - if (io->iocb) - aio_complete(io->iocb, io->result, 0); + ext4_clear_io_unwritten_flag(io); + ext4_release_io_end(io); return ret; } @@ -139,7 +138,7 @@ static void dump_completed_IO(struct inode *inode) } /* Add the io_end to per-inode completed end_io list. */ -void ext4_add_complete_io(ext4_io_end_t *io_end) +static void ext4_add_complete_io(ext4_io_end_t *io_end) { struct ext4_inode_info *ei = EXT4_I(io_end->inode); struct workqueue_struct *wq; @@ -176,8 +175,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode) err = ext4_end_io(io); if (unlikely(!ret && err)) ret = err; - io->flag &= ~EXT4_IO_END_UNWRITTEN; - ext4_free_io_end(io); } return ret; } @@ -209,10 +206,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_LIST_HEAD(&io->list); + atomic_set(&io->count, 1); } return io; } +void ext4_put_io_end_defer(ext4_io_end_t *io_end) +{ + if (atomic_dec_and_test(&io_end->count)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + ext4_release_io_end(io_end); + return; + } + ext4_add_complete_io(io_end); + } +} + +int ext4_put_io_end(ext4_io_end_t *io_end) +{ + int err = 0; + + if (atomic_dec_and_test(&io_end->count)) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + err = ext4_convert_unwritten_extents(io_end->inode, + io_end->offset, io_end->size); + ext4_clear_io_unwritten_flag(io_end); + } + ext4_release_io_end(io_end); + } + return err; +} + +ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) +{ + atomic_inc(&io_end->count); + return io_end; +} + /* * Print an buffer I/O error 
compatible with the fs/buffer.c. This * provides compatibility with dmesg scrapers that look for a specific @@ -233,45 +263,56 @@ static void ext4_end_bio(struct bio *bio, int error) ext4_io_end_t *io_end = bio->bi_private; struct inode *inode; int i; + int blocksize; sector_t bi_sector = bio->bi_sector; BUG_ON(!io_end); + inode = io_end->inode; + blocksize = 1 << inode->i_blkbits; bio->bi_private = NULL; bio->bi_end_io = NULL; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - bio_put(bio); - - for (i = 0; i < io_end->num_io_pages; i++) { - struct page *page = io_end->pages[i]->p_page; + for (i = 0; i < bio->bi_vcnt; i++) { + struct bio_vec *bvec = &bio->bi_io_vec[i]; + struct page *page = bvec->bv_page; struct buffer_head *bh, *head; - loff_t offset; - loff_t io_end_offset; + unsigned bio_start = bvec->bv_offset; + unsigned bio_end = bio_start + bvec->bv_len; + unsigned under_io = 0; + unsigned long flags; + + if (!page) + continue; if (error) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); - head = page_buffers(page); - BUG_ON(!head); - - io_end_offset = io_end->offset + io_end->size; - - offset = (sector_t) page->index << PAGE_CACHE_SHIFT; - bh = head; - do { - if ((offset >= io_end->offset) && - (offset+bh->b_size <= io_end_offset)) - buffer_io_error(bh); - - offset += bh->b_size; - bh = bh->b_this_page; - } while (bh != head); } - - put_io_page(io_end->pages[i]); + bh = head = page_buffers(page); + /* + * We check all buffers in the page under BH_Uptodate_Lock + * to avoid races with other end io clearing async_write flags + */ + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); + do { + if (bh_offset(bh) < bio_start || + bh_offset(bh) + blocksize > bio_end) { + if (buffer_async_write(bh)) + under_io++; + continue; + } + clear_buffer_async_write(bh); + if (error) + buffer_io_error(bh); + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + if 
(!under_io) + end_page_writeback(page); } - io_end->num_io_pages = 0; - inode = io_end->inode; + bio_put(bio); if (error) { io_end->flag |= EXT4_IO_END_ERROR; @@ -284,12 +325,7 @@ static void ext4_end_bio(struct bio *bio, int error) bi_sector >> (inode->i_blkbits - 9)); } - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); - return; - } - - ext4_add_complete_io(io_end); + ext4_put_io_end_defer(io_end); } void ext4_io_submit(struct ext4_io_submit *io) @@ -303,76 +339,59 @@ void ext4_io_submit(struct ext4_io_submit *io) bio_put(io->io_bio); } io->io_bio = NULL; - io->io_op = 0; +} + +void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc) +{ + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + io->io_bio = NULL; io->io_end = NULL; } -static int io_submit_init(struct ext4_io_submit *io, - struct inode *inode, - struct writeback_control *wbc, - struct buffer_head *bh) +static int io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) { - ext4_io_end_t *io_end; - struct page *page = bh->b_page; int nvecs = bio_get_nr_vecs(bh->b_bdev); struct bio *bio; - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) - return -ENOMEM; bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; - bio->bi_private = io->io_end = io_end; bio->bi_end_io = ext4_end_bio; - - io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); - + bio->bi_private = ext4_get_io_end(io->io_end); + if (!io->io_end->size) + io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT) + + bh_offset(bh); io->io_bio = bio; - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE); io->io_next_block = bh->b_blocknr; return 0; } static int io_submit_add_bh(struct ext4_io_submit *io, - struct ext4_io_page *io_page, struct inode *inode, - struct writeback_control *wbc, struct buffer_head *bh) { ext4_io_end_t *io_end; int ret; - if (buffer_new(bh)) { - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); - } - if (io->io_bio && bh->b_blocknr != io->io_next_block) { submit_and_retry: ext4_io_submit(io); } if (io->io_bio == NULL) { - ret = io_submit_init(io, inode, wbc, bh); + ret = io_submit_init_bio(io, bh); if (ret) return ret; } - io_end = io->io_end; - if ((io_end->num_io_pages >= MAX_IO_PAGES) && - (io_end->pages[io_end->num_io_pages-1] != io_page)) - goto submit_and_retry; - if (buffer_uninit(bh)) - ext4_set_io_unwritten_flag(inode, io_end); - io->io_end->size += bh->b_size; - io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; - if ((io_end->num_io_pages == 0) || - (io_end->pages[io_end->num_io_pages-1] != io_page)) { - io_end->pages[io_end->num_io_pages++] = io_page; - atomic_inc(&io_page->p_count); - } + io_end = io->io_end; + if (test_clear_buffer_uninit(bh)) + ext4_set_io_unwritten_flag(inode, io_end); + io_end->size += bh->b_size; + io->io_next_block++; return 0; } @@ -382,33 +401,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - unsigned block_start, block_end, blocksize; - struct ext4_io_page *io_page; + unsigned block_start, blocksize; struct buffer_head *bh, *head; int ret = 0; + int nr_submitted = 0; blocksize = 1 << inode->i_blkbits; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); - if (!io_page) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return -ENOMEM; - } - io_page->p_page = page; - atomic_set(&io_page->p_count, 1); - 
get_page(page); set_page_writeback(page); ClearPageError(page); - for (bh = head = page_buffers(page), block_start = 0; - bh != head || !block_start; - block_start = block_end, bh = bh->b_this_page) { - - block_end = block_start + blocksize; + /* + * In the first loop we prepare and mark buffers to submit. We have to + * mark all buffers in the page before submitting so that + * end_page_writeback() cannot be called from ext4_bio_end_io() when IO + * on the first buffer finishes and we are still working on submitting + * the second buffer. + */ + bh = head = page_buffers(page); + do { + block_start = bh_offset(bh); if (block_start >= len) { /* * Comments copied from block_write_full_page_endio: @@ -421,7 +436,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * mapped, and writes to that region are not written * out to the file." */ - zero_user_segment(page, block_start, block_end); + zero_user_segment(page, block_start, + block_start + blocksize); clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; @@ -435,7 +451,19 @@ int ext4_bio_write_page(struct ext4_io_submit *io, ext4_io_submit(io); continue; } - ret = io_submit_add_bh(io, io_page, inode, wbc, bh); + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + set_buffer_async_write(bh); + } while ((bh = bh->b_this_page) != head); + + /* Now submit buffers to write */ + bh = head = page_buffers(page); + do { + if (!buffer_async_write(bh)) + continue; + ret = io_submit_add_bh(io, inode, bh); if (ret) { /* * We only get here on ENOMEM. Not much else @@ -445,17 +473,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io, redirty_page_for_writepage(wbc, page); break; } + nr_submitted++; clear_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + + /* Error stopped previous loop? Clean up buffers... 
*/ + if (ret) { + do { + clear_buffer_async_write(bh); + bh = bh->b_this_page; + } while (bh != head); } unlock_page(page); - /* - * If the page was truncated before we could do the writeback, - * or we had a memory allocation error while trying to write - * the first buffer head, we won't have submitted any pages for - * I/O. In that case we need to make sure we've cleared the - * PageWriteback bit from the page to prevent the system from - * wedging later on. - */ - put_io_page(io_page); + /* Nothing submitted - we have to end page writeback */ + if (!nr_submitted) + end_page_writeback(page); return ret; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b2c8ee56eb98..b27c96d01965 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -272,7 +272,7 @@ next_group: if (start_blk >= last_blk) goto next_group; group_data[bb_index].block_bitmap = start_blk++; - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; if (flexbg_size > 1) @@ -284,7 +284,7 @@ next_group: if (start_blk >= last_blk) goto next_group; group_data[ib_index].inode_bitmap = start_blk++; - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); + group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; if (flexbg_size > 1) @@ -296,7 +296,7 @@ next_group: if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) goto next_group; group_data[it_index].inode_table = start_blk; - ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); + group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count -= EXT4_SB(sb)->s_itb_per_group; @@ -392,7 +392,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, ext4_group_t group; int err; - ext4_get_group_no_and_offset(sb, block, &group, NULL); + group = ext4_get_group_number(sb, 
block); start = ext4_group_first_block_no(sb, group); group -= flex_gd->groups[0].group; @@ -1341,6 +1341,8 @@ static void ext4_update_super(struct super_block *sb, /* Update the global fs size fields */ sbi->s_groups_count += flex_gd->count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); /* Update the reserved block counts only once the new group is * active. */ @@ -1360,8 +1362,8 @@ static void ext4_update_super(struct super_block *sb, sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, group_data[0].group); - atomic_add(EXT4_NUM_B2C(sbi, free_blocks), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, &sbi->s_flex_groups[flex_group].free_inodes); } @@ -1879,7 +1881,11 @@ retry: /* Nothing need to do */ return 0; - ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); + n_group = ext4_get_group_number(sb, n_blocks_count - 1); + if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { + ext4_warning(sb, "resize would cause inodes_count overflow"); + return -EINVAL; + } ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); n_desc_blocks = num_desc_blocks(sb, n_group + 1); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5e6c87836193..94cc84db7c9a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext2_fs_type = { @@ 
-90,6 +91,8 @@ static struct file_system_type ext2_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext2"); +MODULE_ALIAS("ext2"); #define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) @@ -104,6 +107,8 @@ static struct file_system_type ext3_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext3"); +MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) #else #define IS_EXT3_SB(sb) (0) @@ -349,10 +354,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int error = is_journal_aborted(journal); - struct ext4_journal_cb_entry *jce, *tmp; + struct ext4_journal_cb_entry *jce; + BUG_ON(txn->t_state == T_FINISHED); spin_lock(&sbi->s_md_lock); - list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { + while (!list_empty(&txn->t_private_list)) { + jce = list_entry(txn->t_private_list.next, + struct ext4_journal_cb_entry, jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); jce->jce_func(sb, jce, error); @@ -695,22 +703,19 @@ fail: /* * Release the journal device */ -static int ext4_blkdev_put(struct block_device *bdev) +static void ext4_blkdev_put(struct block_device *bdev) { - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } -static int ext4_blkdev_remove(struct ext4_sb_info *sbi) +static void ext4_blkdev_remove(struct ext4_sb_info *sbi) { struct block_device *bdev; - int ret = -ENODEV; - bdev = sbi->journal_bdev; if (bdev) { - ret = ext4_blkdev_put(bdev); + ext4_blkdev_put(bdev); sbi->journal_bdev = NULL; } - return ret; } static inline struct inode *orphan_list_entry(struct list_head *l) @@ -1798,7 +1803,7 @@ static int options_seq_show(struct seq_file *seq, void *offset) static int 
options_open_fs(struct inode *inode, struct file *file) { - return single_open(file, options_seq_show, PDE(inode)->data); + return single_open(file, options_seq_show, PDE_DATA(inode)); } static const struct file_operations ext4_seq_options_fops = { @@ -1923,8 +1928,8 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group = ext4_flex_group(sbi, i); atomic_add(ext4_free_inodes_count(sb, gdp), &sbi->s_flex_groups[flex_group].free_inodes); - atomic_add(ext4_free_group_clusters(sb, gdp), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(ext4_free_group_clusters(sb, gdp), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); } @@ -1944,16 +1949,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, if ((sbi->s_es->s_feature_ro_compat & cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { /* Use new metadata_csum algorithm */ - __u16 old_csum; + __le16 save_csum; __u32 csum32; - old_csum = gdp->bg_checksum; + save_csum = gdp->bg_checksum; gdp->bg_checksum = 0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, sbi->s_desc_size); - gdp->bg_checksum = old_csum; + gdp->bg_checksum = save_csum; crc = csum32 & 0xFFFF; goto out; @@ -2375,17 +2380,15 @@ struct ext4_attr { int offset; }; -static int parse_strtoul(const char *buf, - unsigned long max, unsigned long *value) +static int parse_strtoull(const char *buf, + unsigned long long max, unsigned long long *value) { - char *endp; - - *value = simple_strtoul(skip_spaces(buf), &endp, 0); - endp = skip_spaces(endp); - if (*endp || *value > max) - return -EINVAL; + int ret; - return 0; + ret = kstrtoull(skip_spaces(buf), 0, value); + if (!ret && *value > max) + ret = -EINVAL; + return ret; } static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, @@ -2427,11 +2430,13 @@ static ssize_t 
inode_readahead_blks_store(struct ext4_attr *a, const char *buf, size_t count) { unsigned long t; + int ret; - if (parse_strtoul(buf, 0x40000000, &t)) - return -EINVAL; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; - if (t && !is_power_of_2(t)) + if (t && (!is_power_of_2(t) || t > 0x40000000)) return -EINVAL; sbi->s_inode_readahead_blks = t; @@ -2452,13 +2457,36 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, { unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); unsigned long t; + int ret; - if (parse_strtoul(buf, 0xffffffff, &t)) - return -EINVAL; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; *ui = t; return count; } +static ssize_t reserved_clusters_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); +} + +static ssize_t reserved_clusters_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long long val; + int ret; + + if (parse_strtoull(buf, -1ULL, &val)) + return -EINVAL; + ret = ext4_reserve_clusters(sbi, val); + + return ret ? 
ret : count; +} + static ssize_t trigger_test_error(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t count) @@ -2496,6 +2524,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) EXT4_RO_ATTR(delayed_allocation_blocks); EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_RW_ATTR(reserved_clusters); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); @@ -2513,6 +2542,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(reserved_clusters), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), @@ -3188,6 +3218,40 @@ int ext4_calculate_overhead(struct super_block *sb) return 0; } + +static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) +{ + ext4_fsblk_t resv_clusters; + + /* + * By default we reserve 2% or 4096 clusters, whichever is smaller. + * This should cover the situations where we can not afford to run + * out of space like for example punch hole, or converting + * uninitialized extents in delalloc path. In most cases such + * allocation would require 1, or 2 blocks, higher numbers are + * very rare. 
+ */ + resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; + + do_div(resv_clusters, 50); + resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); + + return resv_clusters; +} + + +static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) +{ + ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> + sbi->s_cluster_bits; + + if (count >= clusters) + return -EINVAL; + + atomic64_set(&sbi->s_resv_clusters, count); + return 0; +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) { char *orig_data = kstrdup(data, GFP_KERNEL); @@ -3522,6 +3586,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + /* Do we have standard group size of blocksize * 8 blocks ? */ + if (sbi->s_blocks_per_group == blocksize << 3) + set_opt2(sb, STD_GROUP_SIZE); + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; @@ -3694,6 +3762,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.function = print_daily_error_info; sbi->s_err_report.data = (unsigned long) sb; + /* Register extent status tree shrinker */ + ext4_es_register_shrinker(sb); + err = percpu_counter_init(&sbi->s_freeclusters_counter, ext4_count_free_clusters(sb)); if (!err) { @@ -3719,9 +3790,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_max_writeback_mb_bump = 128; sbi->s_extent_max_zeroout_kb = 32; - /* Register extent status tree shrinker */ - ext4_es_register_shrinker(sb); - /* * set up enough so that it can read an inode */ @@ -3907,6 +3975,13 @@ no_journal: "available"); } + err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi)); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " + "reserved pool", 
ext4_calculate_resv_clusters(sbi)); + goto failed_mount4a; + } + err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " @@ -4006,6 +4081,7 @@ failed_mount_wq: sbi->s_journal = NULL; } failed_mount3: + ext4_es_unregister_shrinker(sb); del_timer(&sbi->s_err_report); if (sbi->s_flex_groups) ext4_kvfree(sbi->s_flex_groups); @@ -4173,7 +4249,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } journal->j_private = sb; - ll_rw_block(READ, 1, &journal->j_sb_buffer); + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); wait_on_buffer(journal->j_sb_buffer); if (!buffer_uptodate(journal->j_sb_buffer)) { ext4_msg(sb, KERN_ERR, "I/O error on journal device"); @@ -4738,9 +4814,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - ext4_fsblk_t overhead = 0; + ext4_fsblk_t overhead = 0, resv_blocks; u64 fsid; s64 bfree; + resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); if (!test_opt(sb, MINIX_DF)) overhead = sbi->s_overhead; @@ -4752,8 +4829,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); - buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); - if (buf->f_bfree < ext4_r_blocks_count(es)) + buf->f_bavail = buf->f_bfree - + (ext4_r_blocks_count(es) + resv_blocks); + if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); @@ -4941,6 +5019,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, return PTR_ERR(qf_inode); } + /* Don't account quota for quota files to 
avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; err = dquot_enable(qf_inode, type, format_id, flags); iput(qf_inode); @@ -5152,7 +5232,6 @@ static inline int ext2_feature_set_ok(struct super_block *sb) return 0; return 1; } -MODULE_ALIAS("ext2"); #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } @@ -5185,7 +5264,6 @@ static inline int ext3_feature_set_ok(struct super_block *sb) return 0; return 1; } -MODULE_ALIAS("ext3"); #else static inline void register_as_ext3(void) { } static inline void unregister_as_ext3(void) { } @@ -5199,6 +5277,7 @@ static struct file_system_type ext4_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext4"); static int __init ext4_init_feat_adverts(void) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3a120b277240..c081e34f717f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, struct ext4_xattr_header *hdr) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - __u32 csum, old; + __u32 csum; + __le32 save_csum; + __le64 dsk_block_nr = cpu_to_le64(block_nr); - old = hdr->h_checksum; + save_csum = hdr->h_checksum; hdr->h_checksum = 0; - block_nr = cpu_to_le64(block_nr); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr, - sizeof(block_nr)); + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); csum = ext4_chksum(sbi, csum, (__u8 *)hdr, EXT4_BLOCK_SIZE(inode->i_sb)); - hdr->h_checksum = old; + hdr->h_checksum = save_csum; return cpu_to_le32(csum); } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index aa25deb5c6cd..c767dbdd7fc4 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -22,6 +22,7 @@ #define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_SECURITY 6 #define EXT4_XATTR_INDEX_SYSTEM 7 +#define EXT4_XATTR_INDEX_RICHACL 8 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ 
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 137af4255da6..44abc2f286e0 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -299,7 +299,7 @@ int f2fs_acl_chmod(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct posix_acl *acl; int error; - mode_t mode = get_inode_mode(inode); + umode_t mode = get_inode_mode(inode); if (!test_opt(sbi, POSIX_ACL)) return 0; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2b6fc131e2ce..b1de01da1a40 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -20,6 +20,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include <trace/events/f2fs.h> static struct kmem_cache *orphan_entry_slab; static struct kmem_cache *inode_entry_slab; @@ -57,13 +58,19 @@ repeat: cond_resched(); goto repeat; } - if (f2fs_readpage(sbi, page, index, READ_SYNC)) { + if (PageUptodate(page)) + goto out; + + if (f2fs_readpage(sbi, page, index, READ_SYNC)) + goto repeat; + + lock_page(page); + if (page->mapping != mapping) { f2fs_put_page(page, 1); goto repeat; } +out: mark_page_accessed(page); - - /* We do not allow returning an errorneous page */ return page; } @@ -541,54 +548,44 @@ retry: */ static void block_operations(struct f2fs_sb_info *sbi) { - int t; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; + struct blk_plug plug; - /* Stop renaming operation */ - mutex_lock_op(sbi, RENAME); - mutex_lock_op(sbi, DENTRY_OPS); + blk_start_plug(&plug); -retry_dents: - /* write all the dirty dentry pages */ - sync_dirty_dir_inodes(sbi); +retry_flush_dents: + mutex_lock_all(sbi); - mutex_lock_op(sbi, DATA_WRITE); + /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { - mutex_unlock_op(sbi, DATA_WRITE); - goto retry_dents; + mutex_unlock_all(sbi); + sync_dirty_dir_inodes(sbi); + goto retry_flush_dents; } - /* block all the operations */ - for (t = DATA_NEW; t <= NODE_TRUNC; t++) - mutex_lock_op(sbi, t); - - 
mutex_lock(&sbi->write_inode); - /* * POR: we should ensure that there is no dirty node pages * until finishing nat/sit flush. */ -retry: - sync_node_pages(sbi, 0, &wbc); - - mutex_lock_op(sbi, NODE_WRITE); +retry_flush_nodes: + mutex_lock(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { - mutex_unlock_op(sbi, NODE_WRITE); - goto retry; + mutex_unlock(&sbi->node_write); + sync_node_pages(sbi, 0, &wbc); + goto retry_flush_nodes; } - mutex_unlock(&sbi->write_inode); + blk_finish_plug(&plug); } static void unblock_operations(struct f2fs_sb_info *sbi) { - int t; - for (t = NODE_WRITE; t >= RENAME; t--) - mutex_unlock_op(sbi, t); + mutex_unlock(&sbi->node_write); + mutex_unlock_all(sbi); } static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) @@ -727,9 +724,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); + mutex_lock(&sbi->cp_mutex); block_operations(sbi); + trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); + f2fs_submit_bio(sbi, DATA, true); f2fs_submit_bio(sbi, NODE, true); f2fs_submit_bio(sbi, META, true); @@ -746,13 +747,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) flush_nat_entries(sbi); flush_sit_entries(sbi); - reset_victim_segmap(sbi); - /* unlock all the fs_lock[] in do_checkpoint() */ do_checkpoint(sbi, is_umount); unblock_operations(sbi); mutex_unlock(&sbi->cp_mutex); + + trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); } void init_orphan_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7bd22a201125..91ff93b0b0f4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -12,6 +12,7 @@ #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/mpage.h> +#include <linux/aio.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> @@ 
-21,6 +22,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include <trace/events/f2fs.h> /* * Lock ordering for the change of data block address: @@ -54,6 +56,8 @@ int reserve_new_block(struct dnode_of_data *dn) if (!inc_valid_block_count(sbi, dn->inode, 1)) return -ENOSPC; + trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + __set_data_blkaddr(dn, NEW_ADDR); dn->data_blkaddr = NEW_ADDR; sync_inode_page(dn); @@ -133,7 +137,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) goto end_update; } - /* Frone merge */ + /* Front merge */ if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) { fi->ext.fofs--; fi->ext.blk_addr--; @@ -169,7 +173,7 @@ end_update: return; } -struct page *find_data_page(struct inode *inode, pgoff_t index) +struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; @@ -183,7 +187,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) f2fs_put_page(page, 0); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, RDONLY_NODE); + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) return ERR_PTR(err); f2fs_put_dnode(&dn); @@ -199,12 +203,20 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) if (!page) return ERR_PTR(-ENOMEM); - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); - return ERR_PTR(err); + if (PageUptodate(page)) { + unlock_page(page); + return page; + } + + err = f2fs_readpage(sbi, page, dn.data_blkaddr, + sync ? 
READ_SYNC : READA); + if (sync) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 0); + return ERR_PTR(-EIO); + } } - unlock_page(page); return page; } @@ -222,14 +234,14 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) int err; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, RDONLY_NODE); + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) return ERR_PTR(err); f2fs_put_dnode(&dn); if (dn.data_blkaddr == NULL_ADDR) return ERR_PTR(-ENOENT); - +repeat: page = grab_cache_page(mapping, index); if (!page) return ERR_PTR(-ENOMEM); @@ -241,9 +253,17 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) BUG_ON(dn.data_blkaddr == NULL_ADDR); err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (err) return ERR_PTR(err); + + lock_page(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + if (page->mapping != mapping) { + f2fs_put_page(page, 1); + goto repeat; } return page; } @@ -251,6 +271,9 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) /* * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. + * + * Also, caller should grab and release a mutex by calling mutex_lock_op() and + * mutex_unlock_op(). 
*/ struct page *get_new_data_page(struct inode *inode, pgoff_t index, bool new_i_size) @@ -262,7 +285,7 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index, int err; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, 0); + err = get_dnode_of_data(&dn, index, ALLOC_NODE); if (err) return ERR_PTR(err); @@ -273,7 +296,7 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index, } } f2fs_put_dnode(&dn); - +repeat: page = grab_cache_page(mapping, index); if (!page) return ERR_PTR(-ENOMEM); @@ -283,14 +306,21 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index, if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); } else { err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (err) return ERR_PTR(err); + lock_page(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + if (page->mapping != mapping) { + f2fs_put_page(page, 1); + goto repeat; } } - SetPageUptodate(page); if (new_i_size && i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { @@ -325,21 +355,15 @@ static void read_end_io(struct bio *bio, int err) /* * Fill the locked page with data located in the block address. - * Read operation is synchronous, and caller must unlock the page. + * Return unlocked page. 
*/ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, block_t blk_addr, int type) { struct block_device *bdev = sbi->sb->s_bdev; - bool sync = (type == READ_SYNC); struct bio *bio; - /* This page can be already read by other threads */ - if (PageUptodate(page)) { - if (!sync) - unlock_page(page); - return 0; - } + trace_f2fs_readpage(page, blk_addr, type); down_read(&sbi->bio_sem); @@ -354,18 +378,12 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, kfree(bio->bi_private); bio_put(bio); up_read(&sbi->bio_sem); + f2fs_put_page(page, 1); return -EFAULT; } submit_bio(type, bio); up_read(&sbi->bio_sem); - - /* wait for read completion if sync */ - if (sync) { - lock_page(page); - if (PageError(page)) - return -EIO; - } return 0; } @@ -387,14 +405,18 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock, /* Get the page offset from the block offset(iblock) */ pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); - if (check_extent_cache(inode, pgofs, bh_result)) + if (check_extent_cache(inode, pgofs, bh_result)) { + trace_f2fs_get_data_block(inode, iblock, bh_result, 0); return 0; + } /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE); - if (err) + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + if (err) { + trace_f2fs_get_data_block(inode, iblock, bh_result, err); return (err == -ENOENT) ? 
0 : err; + } /* It does not support data allocation */ BUG_ON(create); @@ -419,6 +441,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock, bh_result->b_size = (i << blkbits); } f2fs_put_dnode(&dn); + trace_f2fs_get_data_block(inode, iblock, bh_result, 0); return 0; } @@ -437,13 +460,12 @@ static int f2fs_read_data_pages(struct file *file, int do_write_data_page(struct page *page) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); block_t old_blk_addr, new_blk_addr; struct dnode_of_data dn; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, RDONLY_NODE); + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) return err; @@ -467,8 +489,6 @@ int do_write_data_page(struct page *page) write_data_page(inode, page, &dn, old_blk_addr, &new_blk_addr); update_extent_cache(new_blk_addr, &dn); - F2FS_I(inode)->data_version = - le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver); } out_writepage: f2fs_put_dnode(&dn); @@ -484,10 +504,11 @@ static int f2fs_write_data_page(struct page *page, const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_CACHE_SHIFT; unsigned offset; + bool need_balance_fs = false; int err = 0; if (page->index < end_index) - goto out; + goto write; /* * If the offset is out-of-range of file size, @@ -499,50 +520,46 @@ static int f2fs_write_data_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); } - goto unlock_out; + goto out; } zero_user_segment(page, offset, PAGE_CACHE_SIZE); -out: - if (sbi->por_doing) - goto redirty_out; - - if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page)) +write: + if (sbi->por_doing) { + err = AOP_WRITEPAGE_ACTIVATE; goto redirty_out; + } - mutex_lock_op(sbi, DATA_WRITE); + /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); + err = 
do_write_data_page(page); + } else { + int ilock = mutex_lock_op(sbi); + err = do_write_data_page(page); + mutex_unlock_op(sbi, ilock); + need_balance_fs = true; } - err = do_write_data_page(page); - if (err && err != -ENOENT) { - wbc->pages_skipped++; - set_page_dirty(page); - } - mutex_unlock_op(sbi, DATA_WRITE); + if (err == -ENOENT) + goto out; + else if (err) + goto redirty_out; if (wbc->for_reclaim) f2fs_submit_bio(sbi, DATA, true); - if (err == -ENOENT) - goto unlock_out; - clear_cold_data(page); +out: unlock_page(page); - - if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode)) + if (need_balance_fs) f2fs_balance_fs(sbi); return 0; -unlock_out: - unlock_page(page); - return (err == -ENOENT) ? 0 : err; - redirty_out: wbc->pages_skipped++; set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; + return err; } #define MAX_DESIRED_PAGES_WP 4096 @@ -561,19 +578,26 @@ static int f2fs_write_data_pages(struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + bool locked = false; int ret; long excess_nrtw = 0, desired_nrtw; + /* deal with chardevs and other special file */ + if (!mapping->a_ops->writepage) + return 0; + if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { desired_nrtw = MAX_DESIRED_PAGES_WP; excess_nrtw = desired_nrtw - wbc->nr_to_write; wbc->nr_to_write = desired_nrtw; } - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode)) { mutex_lock(&sbi->writepages); + locked = true; + } ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); - if (!S_ISDIR(inode->i_mode)) + if (locked) mutex_unlock(&sbi->writepages); f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); @@ -593,39 +617,33 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; struct dnode_of_data dn; int err = 0; + int ilock; /* for nobh_write_end */ *fsdata = NULL; f2fs_balance_fs(sbi); - +repeat: page = 
grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; - mutex_lock_op(sbi, DATA_NEW); + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, 0); - if (err) { - mutex_unlock_op(sbi, DATA_NEW); - f2fs_put_page(page, 1); - return err; - } + err = get_dnode_of_data(&dn, index, ALLOC_NODE); + if (err) + goto err; - if (dn.data_blkaddr == NULL_ADDR) { + if (dn.data_blkaddr == NULL_ADDR) err = reserve_new_block(&dn); - if (err) { - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_NEW); - f2fs_put_page(page, 1); - return err; - } - } + f2fs_put_dnode(&dn); + if (err) + goto err; - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) return 0; @@ -636,21 +654,34 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, /* Reading beyond i_size is simple: memset to zero */ zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); - return 0; + goto out; } if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (err) return err; + lock_page(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return -EIO; + } + if (page->mapping != mapping) { + f2fs_put_page(page, 1); + goto repeat; } } +out: SetPageUptodate(page); clear_cold_data(page); return 0; + +err: + mutex_unlock_op(sbi, ilock); + f2fs_put_page(page, 1); + return err; } static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, @@ -681,7 +712,7 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned long offset) static int f2fs_release_data_page(struct page *page, gfp_t wait) { ClearPagePrivate(page); - return 0; + return 1; } static int f2fs_set_data_page_dirty(struct page *page) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 025b9e2f935d..8d9943786c31 100644 --- 
a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -13,7 +13,6 @@ #include <linux/fs.h> #include <linux/backing-dev.h> -#include <linux/proc_fs.h> #include <linux/f2fs_fs.h> #include <linux/blkdev.h> #include <linux/debugfs.h> @@ -106,7 +105,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) } } mutex_unlock(&sit_i->sentry_lock); - dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100; + dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; si->bimodal = bimodal / dist; if (si->dirty_count) si->avg_vblocks = total_vblocks / ndirty; @@ -138,14 +137,13 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi); if (sbi->segs_per_sec > 1) - si->base_mem += sbi->total_sections * - sizeof(struct sec_entry); + si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry); si->base_mem += __bitmap_size(sbi, SIT_BITMAP); /* build free segmap */ si->base_mem += sizeof(struct free_segmap_info); si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += f2fs_bitmap_size(sbi->total_sections); + si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); /* build curseg */ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; @@ -154,7 +152,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build dirty segmap */ si->base_mem += sizeof(struct dirty_seglist_info); si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); /* buld nm */ si->base_mem += sizeof(struct f2fs_nm_info); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a1f38443ecee..1ac6b93036b7 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -60,7 +60,7 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; 
de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } @@ -148,7 +148,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, for (; bidx < end_block; bidx++) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = find_data_page(dir, bidx); + dentry_page = find_data_page(dir, bidx, true); if (IS_ERR(dentry_page)) { room = true; continue; @@ -189,6 +189,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; + if (namelen > F2FS_NAME_LEN) + return NULL; + if (npages == 0) return NULL; @@ -246,9 +249,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - - mutex_lock_op(sbi, DENTRY_OPS); lock_page(page); wait_on_page_writeback(page); de->ino = cpu_to_le32(inode->i_ino); @@ -262,7 +262,6 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, F2FS_I(inode)->i_pino = dir->i_ino; f2fs_put_page(page, 1); - mutex_unlock_op(sbi, DENTRY_OPS); } void init_dent_inode(const struct qstr *name, struct page *ipage) @@ -281,6 +280,43 @@ void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } +static int make_empty_dir(struct inode *inode, struct inode *parent) +{ + struct page *dentry_page; + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dir_entry *de; + void *kaddr; + + dentry_page = get_new_data_page(inode, 0, true); + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); + + kaddr = kmap_atomic(dentry_page); + dentry_blk = (struct f2fs_dentry_block *)kaddr; + + de = &dentry_blk->dentry[0]; + de->name_len = cpu_to_le16(1); + de->hash_code = 0; + de->ino = cpu_to_le32(inode->i_ino); + memcpy(dentry_blk->filename[0], ".", 1); + set_de_type(de, inode); + + de = &dentry_blk->dentry[1]; + de->hash_code = 0; + de->name_len = cpu_to_le16(2); + de->ino = 
cpu_to_le32(parent->i_ino); + memcpy(dentry_blk->filename[1], "..", 2); + set_de_type(de, inode); + + test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); + test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); + kunmap_atomic(kaddr); + + set_page_dirty(dentry_page); + f2fs_put_page(dentry_page, 1); + return 0; +} + static int init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *name) { @@ -291,7 +327,7 @@ static int init_inode_metadata(struct inode *inode, return err; if (S_ISDIR(inode->i_mode)) { - err = f2fs_make_empty(inode, dir); + err = make_empty_dir(inode, dir); if (err) { remove_inode_page(inode); return err; @@ -314,7 +350,7 @@ static int init_inode_metadata(struct inode *inode, } if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { inc_nlink(inode); - f2fs_write_inode(inode, NULL); + update_inode_page(inode); } return 0; } @@ -338,7 +374,7 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, } if (need_dir_update) - f2fs_write_inode(dir, NULL); + update_inode_page(dir); else mark_inode_dirty(dir); @@ -370,6 +406,10 @@ next: goto next; } +/* + * Caller should grab and release a mutex by calling mutex_lock_op() and + * mutex_unlock_op(). 
+ */ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) { unsigned int bit_pos; @@ -379,7 +419,6 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in f2fs_hash_t dentry_hash; struct f2fs_dir_entry *de; unsigned int nbucket, nblock; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); size_t namelen = name->len; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; @@ -409,12 +448,9 @@ start: bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { - mutex_lock_op(sbi, DENTRY_OPS); dentry_page = get_new_data_page(dir, block, true); - if (IS_ERR(dentry_page)) { - mutex_unlock_op(sbi, DENTRY_OPS); + if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - } dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(dentry_blk, slots); @@ -423,7 +459,6 @@ start: kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - mutex_unlock_op(sbi, DENTRY_OPS); } /* Move to next level to find the empty slot for new dentry */ @@ -453,7 +488,6 @@ add_dentry: fail: kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - mutex_unlock_op(sbi, DENTRY_OPS); return err; } @@ -473,8 +507,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, void *kaddr = page_address(page); int i; - mutex_lock_op(sbi, DENTRY_OPS); - lock_page(page); wait_on_page_writeback(page); @@ -494,7 +526,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (inode && S_ISDIR(inode->i_mode)) { drop_nlink(dir); - f2fs_write_inode(dir, NULL); + update_inode_page(dir); } else { mark_inode_dirty(dir); } @@ -506,7 +538,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, drop_nlink(inode); i_size_write(inode, 0); } - f2fs_write_inode(inode, NULL); + update_inode_page(inode); + if (inode->i_nlink == 0) add_orphan_inode(sbi, inode->i_ino); } @@ -519,45 +552,6 @@ void f2fs_delete_entry(struct 
f2fs_dir_entry *dentry, struct page *page, inode_dec_dirty_dents(dir); } f2fs_put_page(page, 1); - - mutex_unlock_op(sbi, DENTRY_OPS); -} - -int f2fs_make_empty(struct inode *inode, struct inode *parent) -{ - struct page *dentry_page; - struct f2fs_dentry_block *dentry_blk; - struct f2fs_dir_entry *de; - void *kaddr; - - dentry_page = get_new_data_page(inode, 0, true); - if (IS_ERR(dentry_page)) - return PTR_ERR(dentry_page); - - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; - - de = &dentry_blk->dentry[0]; - de->name_len = cpu_to_le16(1); - de->hash_code = f2fs_dentry_hash(".", 1); - de->ino = cpu_to_le32(inode->i_ino); - memcpy(dentry_blk->filename[0], ".", 1); - set_de_type(de, inode); - - de = &dentry_blk->dentry[1]; - de->hash_code = f2fs_dentry_hash("..", 2); - de->name_len = cpu_to_le16(2); - de->ino = cpu_to_le32(parent->i_ino); - memcpy(dentry_blk->filename[1], "..", 2); - set_de_type(de, inode); - - test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); - test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); - kunmap_atomic(kaddr); - - set_page_dirty(dentry_page); - f2fs_put_page(dentry_page, 1); - return 0; } bool f2fs_empty_dir(struct inode *dir) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cc2213afdcc7..20aab02f2a42 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -125,11 +125,15 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) * file keeping -1 as its node offset to * distinguish from index node blocks. */ -#define RDONLY_NODE 1 /* - * specify a read-only mode when getting - * a node block. 0 is read-write mode. - * used by get_dnode_of_data(). +enum { + ALLOC_NODE, /* allocate a new node page if needed */ + LOOKUP_NODE, /* look up a node without readahead */ + LOOKUP_NODE_RA, /* + * look up a node with readahead called + * by get_datablock_ro. 
*/ +}; + #define F2FS_LINK_MAX 32000 /* maximum link count per file */ /* for in-memory extent cache entry */ @@ -137,13 +141,14 @@ struct extent_info { rwlock_t ext_lock; /* rwlock for consistency */ unsigned int fofs; /* start offset in a file */ u32 blk_addr; /* start block address of the extent */ - unsigned int len; /* lenth of the extent */ + unsigned int len; /* length of the extent */ }; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. */ #define FADVISE_COLD_BIT 0x01 +#define FADVISE_CP_BIT 0x02 struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ @@ -155,7 +160,6 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ - unsigned long long data_version;/* latest version of data for fsync */ atomic_t dirty_dents; /* # of dirty dentry pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ @@ -186,7 +190,6 @@ static inline void set_raw_extent(struct extent_info *ext, struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ - nid_t init_scan_nid; /* the first nid to be scanned */ nid_t next_scan_nid; /* the next nid to be scanned */ /* NAT cache management */ @@ -305,23 +308,12 @@ enum count_type { }; /* - * FS_LOCK nesting subclasses for the lock validator: - * - * The locking order between these classes is - * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW - * -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC + * Uses as sbi->fs_lock[NR_GLOBAL_LOCKS]. + * The checkpoint procedure blocks all the locks in this fs_lock array. + * Some FS operations grab free locks, and if there is no free lock, + * then wait to grab a lock in a round-robin manner. 
*/ -enum lock_type { - RENAME, /* for renaming operations */ - DENTRY_OPS, /* for directory operations */ - DATA_WRITE, /* for data write */ - DATA_NEW, /* for data allocation */ - DATA_TRUNC, /* for data truncate */ - NODE_NEW, /* for node allocation */ - NODE_TRUNC, /* for node truncate */ - NODE_WRITE, /* for node write */ - NR_LOCK_TYPE, -}; +#define NR_GLOBAL_LOCKS 8 /* * The below are the page types of bios used in submti_bio(). @@ -361,11 +353,13 @@ struct f2fs_sb_info { /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ struct inode *meta_inode; /* cache meta blocks */ - struct mutex cp_mutex; /* for checkpoint procedure */ - struct mutex fs_lock[NR_LOCK_TYPE]; /* for blocking FS operations */ - struct mutex write_inode; /* mutex for write inode */ + struct mutex cp_mutex; /* checkpoint procedure lock */ + struct mutex fs_lock[NR_GLOBAL_LOCKS]; /* blocking FS operations */ + struct mutex node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ + unsigned char next_lock_num; /* round-robin global locks */ int por_doing; /* recovery is doing or not */ + int on_build_free_nids; /* build_free_nids is doing */ /* for orphan inode management */ struct list_head orphan_inode_list; /* orphan inode list */ @@ -406,6 +400,7 @@ struct f2fs_sb_info { /* for cleaning operations */ struct mutex gc_mutex; /* mutex for GC */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ + unsigned int cur_victim_sec; /* current victim section num */ /* * for stat information. 
@@ -498,22 +493,51 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t) +static inline void mutex_lock_all(struct f2fs_sb_info *sbi) { - mutex_lock_nested(&sbi->fs_lock[t], t); + int i = 0; + for (; i < NR_GLOBAL_LOCKS; i++) + mutex_lock(&sbi->fs_lock[i]); } -static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t) +static inline void mutex_unlock_all(struct f2fs_sb_info *sbi) { - mutex_unlock(&sbi->fs_lock[t]); + int i = 0; + for (; i < NR_GLOBAL_LOCKS; i++) + mutex_unlock(&sbi->fs_lock[i]); +} + +static inline int mutex_lock_op(struct f2fs_sb_info *sbi) +{ + unsigned char next_lock = sbi->next_lock_num % NR_GLOBAL_LOCKS; + int i = 0; + + for (; i < NR_GLOBAL_LOCKS; i++) + if (mutex_trylock(&sbi->fs_lock[i])) + return i; + + mutex_lock(&sbi->fs_lock[next_lock]); + sbi->next_lock_num++; + return next_lock; +} + +static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock) +{ + if (ilock < 0) + return; + BUG_ON(ilock >= NR_GLOBAL_LOCKS); + mutex_unlock(&sbi->fs_lock[ilock]); } /* * Check whether the given nid is within node id range. 
*/ -static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - BUG_ON((nid >= NM_I(sbi)->max_nid)); + WARN_ON((nid >= NM_I(sbi)->max_nid)); + if (nid >= NM_I(sbi)->max_nid) + return -EINVAL; + return 0; } #define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 @@ -819,7 +843,6 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr) /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ - FI_NEED_CP, /* need to do checkpoint during fsync */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ @@ -872,6 +895,7 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); void update_inode(struct inode *, struct page *); +int update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); @@ -973,7 +997,6 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *, int, unsigned int, int); void flush_sit_entries(struct f2fs_sb_info *); int build_segment_manager(struct f2fs_sb_info *); -void reset_victim_segmap(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); /* @@ -1000,7 +1023,7 @@ void destroy_checkpoint_caches(void); */ int reserve_new_block(struct dnode_of_data *); void update_extent_cache(block_t, struct dnode_of_data *); -struct page *find_data_page(struct inode *, pgoff_t); +struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, pgoff_t, bool); int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); @@ -1020,7 +1043,7 @@ void destroy_gc_caches(void); /* * recovery.c */ -void recover_fsync_data(struct f2fs_sb_info *); +int 
recover_fsync_data(struct f2fs_sb_info *); bool space_for_roll_forward(struct f2fs_sb_info *); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 958a46da19ae..1cae864f8dfc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -13,6 +13,7 @@ #include <linux/stat.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/blkdev.h> #include <linux/falloc.h> #include <linux/types.h> #include <linux/compat.h> @@ -24,6 +25,7 @@ #include "segment.h" #include "xattr.h" #include "acl.h" +#include <trace/events/f2fs.h> static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -33,19 +35,18 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); block_t old_blk_addr; struct dnode_of_data dn; - int err; + int err, ilock; f2fs_balance_fs(sbi); sb_start_pagefault(inode->i_sb); - mutex_lock_op(sbi, DATA_NEW); - /* block allocation */ + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, 0); + err = get_dnode_of_data(&dn, page->index, ALLOC_NODE); if (err) { - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); goto out; } @@ -55,13 +56,12 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, err = reserve_new_block(&dn); if (err) { f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); goto out; } } f2fs_put_dnode(&dn); - - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); lock_page(page); if (page->mapping != inode->i_mapping || @@ -102,28 +102,10 @@ static const struct vm_operations_struct f2fs_file_vm_ops = { .remap_pages = generic_file_remap_pages, }; -static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode) -{ - struct dentry *dentry; - nid_t pino; - - inode = igrab(inode); - dentry = d_find_any_alias(inode); - if (!dentry) { - iput(inode); - return 0; - } - pino = dentry->d_parent->d_inode->i_ino; - dput(dentry); - iput(inode); 
- return !is_checkpointed_node(sbi, pino); -} - int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - unsigned long long cur_version; int ret = 0; bool need_cp = false; struct writeback_control wbc = { @@ -135,9 +117,12 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (inode->i_sb->s_flags & MS_RDONLY) return 0; + trace_f2fs_sync_file_enter(inode); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) + if (ret) { + trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; + } /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); @@ -147,28 +132,18 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; - mutex_lock(&sbi->cp_mutex); - cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver); - mutex_unlock(&sbi->cp_mutex); - - if (F2FS_I(inode)->data_version != cur_version && - !(inode->i_state & I_DIRTY)) - goto out; - F2FS_I(inode)->data_version--; - if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; - else if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP)) + else if (is_cp_file(inode)) need_cp = true; else if (!space_for_roll_forward(sbi)) need_cp = true; - else if (need_to_sync_dir(sbi, inode)) + else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) need_cp = true; if (need_cp) { /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); - clear_inode_flag(F2FS_I(inode), FI_NEED_CP); } else { /* if there is no written node page, write its inode page */ while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { @@ -178,9 +153,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); + ret = blkdev_issue_flush(inode->i_sb->s_bdev, 
GFP_KERNEL, NULL); } out: mutex_unlock(&inode->i_mutex); + trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } @@ -216,6 +193,9 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count) sync_inode_page(dn); } dn->ofs_in_node = ofs; + + trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, + dn->ofs_in_node, nr_free); return nr_free; } @@ -232,11 +212,15 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) if (!offset) return; - page = find_data_page(inode, from >> PAGE_CACHE_SHIFT); + page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false); if (IS_ERR(page)) return; lock_page(page); + if (page->mapping != inode->i_mapping) { + f2fs_put_page(page, 1); + return; + } wait_on_page_writeback(page); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); @@ -249,20 +233,22 @@ static int truncate_blocks(struct inode *inode, u64 from) unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; - int count = 0; + int count = 0, ilock = -1; int err; + trace_f2fs_truncate_blocks_enter(inode, from); + free_from = (pgoff_t) ((from + blocksize - 1) >> (sbi->log_blocksize)); - mutex_lock_op(sbi, DATA_TRUNC); - + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, free_from, RDONLY_NODE); + err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); if (err) { if (err == -ENOENT) goto free_next; - mutex_unlock_op(sbi, DATA_TRUNC); + mutex_unlock_op(sbi, ilock); + trace_f2fs_truncate_blocks_exit(inode, err); return err; } @@ -273,6 +259,7 @@ static int truncate_blocks(struct inode *inode, u64 from) count -= dn.ofs_in_node; BUG_ON(count < 0); + if (dn.ofs_in_node || IS_INODE(dn.node_page)) { truncate_data_blocks_range(&dn, count); free_from += count; @@ -281,11 +268,12 @@ static int truncate_blocks(struct inode *inode, u64 from) f2fs_put_dnode(&dn); free_next: err = truncate_inode_blocks(inode, free_from); - 
mutex_unlock_op(sbi, DATA_TRUNC); + mutex_unlock_op(sbi, ilock); /* lastly zero out the first data page */ truncate_partial_data_page(inode, from); + trace_f2fs_truncate_blocks_exit(inode, err); return err; } @@ -295,6 +283,8 @@ void f2fs_truncate(struct inode *inode) S_ISLNK(inode->i_mode))) return; + trace_f2fs_truncate(inode); + if (!truncate_blocks(inode, i_size_read(inode))) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); @@ -389,15 +379,16 @@ static void fill_zero(struct inode *inode, pgoff_t index, { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; + int ilock; if (!len) return; f2fs_balance_fs(sbi); - mutex_lock_op(sbi, DATA_NEW); + ilock = mutex_lock_op(sbi); page = get_new_data_page(inode, index, false); - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); if (!IS_ERR(page)) { wait_on_page_writeback(page); @@ -414,15 +405,10 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) for (index = pg_start; index < pg_end; index++) { struct dnode_of_data dn; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - - f2fs_balance_fs(sbi); - mutex_lock_op(sbi, DATA_TRUNC); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, RDONLY_NODE); + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { - mutex_unlock_op(sbi, DATA_TRUNC); if (err == -ENOENT) continue; return err; @@ -431,7 +417,6 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) if (dn.data_blkaddr != NULL_ADDR) truncate_data_blocks_range(&dn, 1); f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_TRUNC); } return 0; } @@ -461,12 +446,19 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) if (pg_start < pg_end) { struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int ilock; + + f2fs_balance_fs(sbi); blk_start = pg_start << PAGE_CACHE_SHIFT; blk_end = pg_end << 
PAGE_CACHE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); + + ilock = mutex_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); + mutex_unlock_op(sbi, ilock); } } @@ -500,13 +492,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset, for (index = pg_start; index <= pg_end; index++) { struct dnode_of_data dn; + int ilock; - mutex_lock_op(sbi, DATA_NEW); - + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, index, 0); + ret = get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); break; } @@ -514,13 +506,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, ret = reserve_new_block(&dn); if (ret) { f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); break; } } f2fs_put_dnode(&dn); - - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); if (pg_start == pg_end) new_size = offset + len; @@ -559,6 +550,7 @@ static long f2fs_fallocate(struct file *file, int mode, inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } + trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; } @@ -590,7 +582,7 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { unsigned int oldflags; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -627,7 +619,7 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } default: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 94b8a0c48453..14961593e93c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -11,7 +11,6 @@ #include <linux/fs.h> #include <linux/module.h> #include <linux/backing-dev.h> -#include <linux/proc_fs.h> #include <linux/init.h> #include <linux/f2fs_fs.h> #include 
<linux/kthread.h> @@ -23,6 +22,7 @@ #include "node.h" #include "segment.h" #include "gc.h" +#include <trace/events/f2fs.h> static struct kmem_cache *winode_slab; @@ -81,9 +81,6 @@ static int gc_thread_func(void *data) /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi)) wait_ms = GC_THREAD_NOGC_SLEEP_TIME; - else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) - wait_ms = GC_THREAD_MAX_SLEEP_TIME; - } while (!kthread_should_stop()); return 0; } @@ -131,7 +128,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - if (p->alloc_mode) { + if (p->alloc_mode == SSR) { p->gc_mode = GC_GREEDY; p->dirty_segmap = dirty_i->dirty_segmap[type]; p->ofs_unit = 1; @@ -160,18 +157,21 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno; + unsigned int hint = 0; + unsigned int secno; /* * If the gc_type is FG_GC, we can select victim segments * selected by background GC before. * Those segments guarantee they have small valid blocks. */ - segno = find_next_bit(dirty_i->victim_segmap[BG_GC], - TOTAL_SEGS(sbi), 0); - if (segno < TOTAL_SEGS(sbi)) { - clear_bit(segno, dirty_i->victim_segmap[BG_GC]); - return segno; +next: + secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++); + if (secno < TOTAL_SECS(sbi)) { + if (sec_usage_check(sbi, secno)) + goto next; + clear_bit(secno, dirty_i->victim_secmap); + return secno * sbi->segs_per_sec; } return NULL_SEGNO; } @@ -222,7 +222,7 @@ static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, } /* - * This function is called from two pathes. + * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. * When it is called during GC, it just gets a victim segment * and it does not remove it from dirty seglist. 
@@ -234,7 +234,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; - unsigned int segno; + unsigned int secno; int nsearched = 0; p.alloc_mode = alloc_mode; @@ -253,6 +253,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, while (1) { unsigned long cost; + unsigned int segno; segno = find_next_bit(p.dirty_segmap, TOTAL_SEGS(sbi), p.offset); @@ -265,13 +266,11 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, break; } p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit; + secno = GET_SECNO(sbi, segno); - if (test_bit(segno, dirty_i->victim_segmap[FG_GC])) - continue; - if (gc_type == BG_GC && - test_bit(segno, dirty_i->victim_segmap[BG_GC])) + if (sec_usage_check(sbi, secno)) continue; - if (IS_CURSEC(sbi, GET_SECNO(sbi, segno))) + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) continue; cost = get_gc_cost(sbi, segno, &p); @@ -291,13 +290,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } got_it: if (p.min_segno != NULL_SEGNO) { - *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; if (p.alloc_mode == LFS) { - int i; - for (i = 0; i < p.ofs_unit; i++) - set_bit(*result + i, - dirty_i->victim_segmap[gc_type]); + secno = GET_SECNO(sbi, p.min_segno); + if (gc_type == FG_GC) + sbi->cur_victim_sec = secno; + else + set_bit(secno, dirty_i->victim_secmap); } + *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; + + trace_f2fs_get_victim(sbi->sb, type, gc_type, &p, + sbi->cur_victim_sec, + prefree_segments(sbi), free_segments(sbi)); } mutex_unlock(&dirty_i->seglist_lock); @@ -381,6 +385,7 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, next_step: entry = sum; + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; @@ -401,11 +406,18 @@ next_step: continue; /* set page dirty and write it */ - if (!PageWriteback(node_page)) + if (gc_type == FG_GC) 
{ + f2fs_submit_bio(sbi, NODE, true); + wait_on_page_writeback(node_page); set_page_dirty(node_page); + } else { + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } f2fs_put_page(node_page, 1); stat_inc_node_blk_count(sbi, 1); } + if (initial) { initial = false; goto next_step; @@ -418,6 +430,13 @@ next_step: .for_reclaim = 0, }; sync_node_pages(sbi, 0, &wbc); + + /* + * In the case of FG_GC, it'd be better to reclaim this victim + * completely. + */ + if (get_valid_blocks(sbi, segno, 1) != 0) + goto next_step; } } @@ -481,21 +500,19 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, static void move_data_page(struct inode *inode, struct page *page, int gc_type) { - if (page->mapping != inode->i_mapping) - goto out; - - if (inode != page->mapping->host) - goto out; - - if (PageWriteback(page)) - goto out; - if (gc_type == BG_GC) { + if (PageWriteback(page)) + goto out; set_page_dirty(page); set_cold_data(page); } else { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - mutex_lock_op(sbi, DATA_WRITE); + + if (PageWriteback(page)) { + f2fs_submit_bio(sbi, DATA, true); + wait_on_page_writeback(page); + } + if (clear_page_dirty_for_io(page) && S_ISDIR(inode->i_mode)) { dec_page_count(sbi, F2FS_DIRTY_DENTS); @@ -503,7 +520,6 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) } set_cold_data(page); do_write_data_page(page); - mutex_unlock_op(sbi, DATA_WRITE); clear_cold_data(page); } out: @@ -530,6 +546,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, next_step: entry = sum; + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { struct page *data_page; struct inode *inode; @@ -567,7 +584,7 @@ next_step: continue; data_page = find_data_page(inode, - start_bidx + ofs_in_node); + start_bidx + ofs_in_node, false); if (IS_ERR(data_page)) goto next_iput; @@ -588,11 +605,22 @@ next_step: next_iput: iput(inode); } + if (++phase < 4) goto next_step; - if (gc_type == 
FG_GC) + if (gc_type == FG_GC) { f2fs_submit_bio(sbi, DATA, true); + + /* + * In the case of FG_GC, it'd be better to reclaim this victim + * completely. + */ + if (get_valid_blocks(sbi, segno, 1) != 0) { + phase = 2; + goto next_step; + } + } } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -611,18 +639,15 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, { struct page *sum_page; struct f2fs_summary_block *sum; + struct blk_plug plug; /* read segment summary of victim */ sum_page = get_sum_page(sbi, segno); if (IS_ERR(sum_page)) return; - /* - * CP needs to lock sum_page. In this time, we don't need - * to lock this page, because this summary page is not gone anywhere. - * Also, this page is not gonna be updated before GC is done. - */ - unlock_page(sum_page); + blk_start_plug(&plug); + sum = page_address(sum_page); switch (GET_SUM_TYPE((&sum->footer))) { @@ -633,10 +658,12 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); break; } + blk_finish_plug(&plug); + stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer))); stat_inc_call_count(sbi->stat_info); - f2fs_put_page(sum_page, 0); + f2fs_put_page(sum_page, 1); } int f2fs_gc(struct f2fs_sb_info *sbi) @@ -652,8 +679,10 @@ gc_more: if (!(sbi->sb->s_flags & MS_ACTIVE)) goto stop; - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; + write_checkpoint(sbi, false); + } if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) goto stop; @@ -662,9 +691,11 @@ gc_more: for (i = 0; i < sbi->segs_per_sec; i++) do_garbage_collect(sbi, segno + i, &ilist, gc_type); - if (gc_type == FG_GC && - get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) + if (gc_type == FG_GC) { + sbi->cur_victim_sec = NULL_SEGNO; nfree++; + WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec)); + } if 
(has_not_enough_free_secs(sbi, nfree)) goto gc_more; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 30b2db003acd..2c6a6bd08322 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,9 +13,9 @@ * whether IO subsystem is idle * or not */ -#define GC_THREAD_MIN_SLEEP_TIME 10000 /* milliseconds */ -#define GC_THREAD_MAX_SLEEP_TIME 30000 -#define GC_THREAD_NOGC_SLEEP_TIME 10000 +#define GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ +#define GC_THREAD_MAX_SLEEP_TIME 60000 +#define GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ @@ -58,6 +58,9 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) static inline long increase_sleep_time(long wait) { + if (wait == GC_THREAD_NOGC_SLEEP_TIME) + return wait; + wait += GC_THREAD_MIN_SLEEP_TIME; if (wait > GC_THREAD_MAX_SLEEP_TIME) wait = GC_THREAD_MAX_SLEEP_TIME; @@ -66,6 +69,9 @@ static inline long increase_sleep_time(long wait) static inline long decrease_sleep_time(long wait) { + if (wait == GC_THREAD_NOGC_SLEEP_TIME) + wait = GC_THREAD_MAX_SLEEP_TIME; + wait -= GC_THREAD_MIN_SLEEP_TIME; if (wait <= GC_THREAD_MIN_SLEEP_TIME) wait = GC_THREAD_MIN_SLEEP_TIME; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ddae412d30c8..91ac7f9d88ee 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -16,6 +16,8 @@ #include "f2fs.h" #include "node.h" +#include <trace/events/f2fs.h> + void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; @@ -44,7 +46,11 @@ static int do_read_inode(struct inode *inode) struct f2fs_inode *ri; /* Check if ino is within scope */ - check_nid_range(sbi, inode->i_ino); + if (check_nid_range(sbi, inode->i_ino)) { + f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", + (unsigned long) inode->i_ino); + return -EINVAL; + } node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) @@ -76,7 +82,6 @@ 
static int do_read_inode(struct inode *inode) fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); fi->i_flags = le32_to_cpu(ri->i_flags); fi->flags = 0; - fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1; fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); get_extent_info(&fi->ext, ri->i_ext); @@ -88,13 +93,16 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - int ret; + int ret = 0; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + + if (!(inode->i_state & I_NEW)) { + trace_f2fs_iget(inode); return inode; + } if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) goto make_now; @@ -136,11 +144,12 @@ make_now: goto bad_inode; } unlock_new_inode(inode); - + trace_f2fs_iget(inode); return inode; bad_inode: iget_failed(inode); + trace_f2fs_iget_exit(inode, ret); return ERR_PTR(ret); } @@ -192,47 +201,51 @@ void update_inode(struct inode *inode, struct page *node_page) set_page_dirty(node_page); } -int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) +int update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *node_page; - bool need_lock = false; - - if (inode->i_ino == F2FS_NODE_INO(sbi) || - inode->i_ino == F2FS_META_INO(sbi)) - return 0; - - if (wbc) - f2fs_balance_fs(sbi); node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); - if (!PageDirty(node_page)) { - need_lock = true; - f2fs_put_page(node_page, 1); - mutex_lock(&sbi->write_inode); - node_page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) { - mutex_unlock(&sbi->write_inode); - return PTR_ERR(node_page); - } - } update_inode(inode, node_page); f2fs_put_page(node_page, 1); - if (need_lock) - mutex_unlock(&sbi->write_inode); return 0; } +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct 
f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int ret, ilock; + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return 0; + + if (wbc) + f2fs_balance_fs(sbi); + + /* + * We need to lock here to prevent from producing dirty node pages + * during the urgent cleaning time when runing out of free sections. + */ + ilock = mutex_lock_op(sbi); + ret = update_inode_page(inode); + mutex_unlock_op(sbi, ilock); + return ret; +} + /* * Called at the last iput() if i_nlink is zero */ void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int ilock; + trace_f2fs_evict_inode(inode); truncate_inode_pages(&inode->i_data, 0); if (inode->i_ino == F2FS_NODE_INO(sbi) || @@ -252,7 +265,10 @@ void f2fs_evict_inode(struct inode *inode) if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); + ilock = mutex_lock_op(sbi); remove_inode_page(inode); + mutex_unlock_op(sbi, ilock); + sb_end_intwrite(inode->i_sb); no_delete: clear_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 1a49b881bac0..47abc9722b17 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -15,8 +15,10 @@ #include <linux/ctype.h> #include "f2fs.h" +#include "node.h" #include "xattr.h" #include "acl.h" +#include <trace/events/f2fs.h> static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) { @@ -25,19 +27,19 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; - int err; + int err, ilock; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); - mutex_lock_op(sbi, NODE_NEW); + ilock = mutex_lock_op(sbi); if (!alloc_nid(sbi, &ino)) { - mutex_unlock_op(sbi, NODE_NEW); + mutex_unlock_op(sbi, ilock); err = -ENOSPC; goto fail; } - mutex_unlock_op(sbi, NODE_NEW); + mutex_unlock_op(sbi, ilock); inode->i_uid = current_fsuid(); @@ -61,7 +63,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_free = true; goto out; } - + 
trace_f2fs_new_inode(inode, 0); mark_inode_dirty(inode); return inode; @@ -69,6 +71,8 @@ out: clear_nlink(inode); unlock_new_inode(inode); fail: + trace_f2fs_new_inode(inode, err); + make_bad_inode(inode); iput(inode); if (nid_free) alloc_nid_failed(sbi, ino); @@ -82,7 +86,7 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) int ret; if (sublen > slen) - return 1; + return 0; ret = memcmp(s + slen - sublen, sub, sublen); if (ret) { /* compare upper case */ @@ -90,16 +94,16 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) char upper_sub[8]; for (i = 0; i < sublen && i < sizeof(upper_sub); i++) upper_sub[i] = toupper(sub[i]); - return memcmp(s + slen - sublen, upper_sub, sublen); + return !memcmp(s + slen - sublen, upper_sub, sublen); } - return ret; + return !ret; } /* * Set multimedia files as cold files for hot/cold data separation */ -static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode, +static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) { int i; @@ -107,8 +111,8 @@ static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode, int count = le32_to_cpu(sbi->raw_super->extension_count); for (i = 0; i < count; i++) { - if (!is_multimedia_file(name, extlist[i])) { - F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT; + if (is_multimedia_file(name, extlist[i])) { + set_cold_file(inode); break; } } @@ -121,7 +125,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; nid_t ino = 0; - int err; + int err, ilock; f2fs_balance_fs(sbi); @@ -130,14 +134,16 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return PTR_ERR(inode); if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_cold_file(sbi, inode, dentry->d_name.name); + set_cold_files(sbi, inode, dentry->d_name.name); inode->i_op = &f2fs_file_inode_operations; 
inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out; @@ -150,6 +156,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, out: clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, ino); return err; @@ -161,7 +168,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct inode *inode = old_dentry->d_inode; struct super_block *sb = dir->i_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); - int err; + int err, ilock; f2fs_balance_fs(sbi); @@ -169,14 +176,23 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, atomic_inc(&inode->i_count); set_inode_flag(F2FS_I(inode), FI_INC_LINK); + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out; + /* + * This file should be checkpointed during fsync. + * We lost i_pino from now on. 
+ */ + set_cp_file(inode); + d_instantiate(dentry, inode); return 0; out: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + make_bad_inode(inode); iput(inode); return err; } @@ -197,7 +213,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct f2fs_dir_entry *de; struct page *page; - if (dentry->d_name.len > F2FS_MAX_NAME_LEN) + if (dentry->d_name.len > F2FS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); @@ -222,7 +238,9 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) struct f2fs_dir_entry *de; struct page *page; int err = -ENOENT; + int ilock; + trace_f2fs_unlink_enter(dir, dentry); f2fs_balance_fs(sbi); de = f2fs_find_entry(dir, &dentry->d_name, &page); @@ -236,11 +254,14 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } + ilock = mutex_lock_op(sbi); f2fs_delete_entry(de, page, inode); + mutex_unlock_op(sbi, ilock); /* In order to evict this inode, we set it dirty */ mark_inode_dirty(inode); fail: + trace_f2fs_unlink_exit(inode, err); return err; } @@ -251,7 +272,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; size_t symlen = strlen(symname) + 1; - int err; + int err, ilock; f2fs_balance_fs(sbi); @@ -262,7 +283,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out; @@ -275,6 +298,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, out: clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, inode->i_ino); return err; @@ -284,7 +308,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct 
inode *inode; - int err; + int err, ilock; f2fs_balance_fs(sbi); @@ -298,7 +322,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); set_inode_flag(F2FS_I(inode), FI_INC_LINK); + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out_fail; @@ -313,6 +339,7 @@ out_fail: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, inode->i_ino); return err; @@ -333,6 +360,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; int err = 0; + int ilock; if (!new_valid_dev(rdev)) return -EINVAL; @@ -346,7 +374,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out; @@ -357,6 +387,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, out: clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, inode->i_ino); return err; @@ -374,7 +405,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; - int err = -ENOENT; + int err = -ENOENT, ilock = -1; f2fs_balance_fs(sbi); @@ -389,7 +420,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_old; } - mutex_lock_op(sbi, RENAME); + ilock = mutex_lock_op(sbi); if (new_inode) { struct page *new_page; @@ -412,7 +443,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, drop_nlink(new_inode); if (!new_inode->i_nlink) add_orphan_inode(sbi, new_inode->i_ino); - 
f2fs_write_inode(new_inode, NULL); + update_inode_page(new_inode); } else { err = f2fs_add_link(new_dentry, old_inode); if (err) @@ -420,12 +451,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir_entry) { inc_nlink(new_dir); - f2fs_write_inode(new_dir, NULL); + update_inode_page(new_dir); } } old_inode->i_ctime = CURRENT_TIME; - set_inode_flag(F2FS_I(old_inode), FI_NEED_CP); mark_inode_dirty(old_inode); f2fs_delete_entry(old_entry, old_page, NULL); @@ -439,10 +469,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_put_page(old_dir_page, 0); } drop_nlink(old_dir); - f2fs_write_inode(old_dir, NULL); + update_inode_page(old_dir); } - mutex_unlock_op(sbi, RENAME); + mutex_unlock_op(sbi, ilock); return 0; out_dir: @@ -450,7 +480,7 @@ out_dir: kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } - mutex_unlock_op(sbi, RENAME); + mutex_unlock_op(sbi, ilock); out_old: kunmap(old_page); f2fs_put_page(old_page, 0); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e275218904ed..3df43b4efd89 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,6 +19,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include <trace/events/f2fs.h> static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -88,10 +89,13 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) { struct address_space *mapping = sbi->meta_inode->i_mapping; struct f2fs_nm_info *nm_i = NM_I(sbi); + struct blk_plug plug; struct page *page; pgoff_t index; int i; + blk_start_plug(&plug); + for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { if (nid >= nm_i->max_nid) nid = 0; @@ -100,12 +104,16 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) page = grab_cache_page(mapping, index); if (!page) continue; - if (f2fs_readpage(sbi, page, index, READ)) { + if (PageUptodate(page)) { f2fs_put_page(page, 1); continue; } + if (f2fs_readpage(sbi, page, index, READ)) + continue; + 
f2fs_put_page(page, 0); } + blk_finish_plug(&plug); } static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) @@ -236,7 +244,7 @@ static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD) + if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD) return 0; write_lock(&nm_i->nat_tree_lock); @@ -320,15 +328,14 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[0] = 0; if (block < direct_index) { - offset[n++] = block; - level = 0; + offset[n] = block; goto got; } block -= direct_index; if (block < direct_blks) { offset[n++] = NODE_DIR1_BLOCK; noffset[n] = 1; - offset[n++] = block; + offset[n] = block; level = 1; goto got; } @@ -336,7 +343,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) if (block < direct_blks) { offset[n++] = NODE_DIR2_BLOCK; noffset[n] = 2; - offset[n++] = block; + offset[n] = block; level = 1; goto got; } @@ -346,7 +353,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[n] = 3; offset[n++] = block / direct_blks; noffset[n] = 4 + offset[n - 1]; - offset[n++] = block % direct_blks; + offset[n] = block % direct_blks; level = 2; goto got; } @@ -356,7 +363,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[n] = 4 + dptrs_per_blk; offset[n++] = block / direct_blks; noffset[n] = 5 + dptrs_per_blk + offset[n - 1]; - offset[n++] = block % direct_blks; + offset[n] = block % direct_blks; level = 2; goto got; } @@ -371,7 +378,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[n] = 7 + (dptrs_per_blk * 2) + offset[n - 2] * (dptrs_per_blk + 1) + offset[n - 1]; - offset[n++] = block % direct_blks; + offset[n] = block % direct_blks; level = 3; goto got; } else { @@ -383,8 +390,11 @@ got: /* * Caller should call f2fs_put_dnode(dn). 
+ * Also, it should grab and release a mutex by calling mutex_lock_op() and + * mutex_unlock_op() only if ro is not set RDONLY_NODE. + * In the case of RDONLY_NODE, we don't need to care about mutex. */ -int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct page *npage[4]; @@ -403,7 +413,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) return PTR_ERR(npage[0]); parent = npage[0]; - nids[1] = get_nid(parent, offset[0], true); + if (level != 0) + nids[1] = get_nid(parent, offset[0], true); dn->inode_page = npage[0]; dn->inode_page_locked = true; @@ -411,12 +422,9 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) for (i = 1; i <= level; i++) { bool done = false; - if (!nids[i] && !ro) { - mutex_lock_op(sbi, NODE_NEW); - + if (!nids[i] && mode == ALLOC_NODE) { /* alloc new node */ if (!alloc_nid(sbi, &(nids[i]))) { - mutex_unlock_op(sbi, NODE_NEW); err = -ENOSPC; goto release_pages; } @@ -425,16 +433,14 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) npage[i] = new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { alloc_nid_failed(sbi, nids[i]); - mutex_unlock_op(sbi, NODE_NEW); err = PTR_ERR(npage[i]); goto release_pages; } set_nid(parent, offset[i - 1], nids[i], i == 1); alloc_nid_done(sbi, nids[i]); - mutex_unlock_op(sbi, NODE_NEW); done = true; - } else if (ro && i == level && level > 1) { + } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) { npage[i] = get_node_page_ra(parent, offset[i - 1]); if (IS_ERR(npage[i])) { err = PTR_ERR(npage[i]); @@ -507,6 +513,7 @@ invalidate: f2fs_put_page(dn->node_page, 1); dn->node_page = NULL; + trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); } static int truncate_dnode(struct dnode_of_data *dn) @@ -547,9 +554,13 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int 
nofs, if (dn->nid == 0) return NIDS_PER_BLOCK + 1; + trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); + page = get_node_page(sbi, dn->nid); - if (IS_ERR(page)) + if (IS_ERR(page)) { + trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); return PTR_ERR(page); + } rn = (struct f2fs_node *)page_address(page); if (depth < 3) { @@ -591,10 +602,12 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, } else { f2fs_put_page(page, 1); } + trace_f2fs_truncate_nodes_exit(dn->inode, freed); return freed; out_err: f2fs_put_page(page, 1); + trace_f2fs_truncate_nodes_exit(dn->inode, ret); return ret; } @@ -649,6 +662,9 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, fail: for (i = depth - 3; i >= 0; i--) f2fs_put_page(pages[i], 1); + + trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); + return err; } @@ -658,6 +674,7 @@ fail: int truncate_inode_blocks(struct inode *inode, pgoff_t from) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct address_space *node_mapping = sbi->node_inode->i_mapping; int err = 0, cont = 1; int level, offset[4], noffset[4]; unsigned int nofs = 0; @@ -665,11 +682,15 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) struct dnode_of_data dn; struct page *page; - level = get_node_path(from, offset, noffset); + trace_f2fs_truncate_inode_blocks_enter(inode, from); + level = get_node_path(from, offset, noffset); +restart: page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) + if (IS_ERR(page)) { + trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); return PTR_ERR(page); + } set_new_dnode(&dn, inode, page, NULL, 0); unlock_page(page); @@ -728,6 +749,10 @@ skip_partial: if (offset[1] == 0 && rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); + if (page->mapping != node_mapping) { + f2fs_put_page(page, 1); + goto restart; + } wait_on_page_writeback(page); rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); @@ -739,9 +764,14 
@@ skip_partial: } fail: f2fs_put_page(page, 0); + trace_f2fs_truncate_inode_blocks_exit(inode, err); return err > 0 ? 0 : err; } +/* + * Caller should grab and release a mutex by calling mutex_lock_op() and + * mutex_unlock_op(). + */ int remove_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -749,21 +779,16 @@ int remove_inode_page(struct inode *inode) nid_t ino = inode->i_ino; struct dnode_of_data dn; - mutex_lock_op(sbi, NODE_TRUNC); page = get_node_page(sbi, ino); - if (IS_ERR(page)) { - mutex_unlock_op(sbi, NODE_TRUNC); + if (IS_ERR(page)) return PTR_ERR(page); - } if (F2FS_I(inode)->i_xattr_nid) { nid_t nid = F2FS_I(inode)->i_xattr_nid; struct page *npage = get_node_page(sbi, nid); - if (IS_ERR(npage)) { - mutex_unlock_op(sbi, NODE_TRUNC); + if (IS_ERR(npage)) return PTR_ERR(npage); - } F2FS_I(inode)->i_xattr_nid = 0; set_new_dnode(&dn, inode, page, npage, nid); @@ -775,23 +800,18 @@ int remove_inode_page(struct inode *inode) BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1); set_new_dnode(&dn, inode, page, page, ino); truncate_node(&dn); - - mutex_unlock_op(sbi, NODE_TRUNC); return 0; } int new_inode_page(struct inode *inode, const struct qstr *name) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; struct dnode_of_data dn; /* allocate inode page for new inode */ set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); - mutex_lock_op(sbi, NODE_NEW); page = new_node_page(&dn, 0); init_dent_inode(name, page); - mutex_unlock_op(sbi, NODE_NEW); if (IS_ERR(page)) return PTR_ERR(page); f2fs_put_page(page, 1); @@ -844,6 +864,12 @@ fail: return ERR_PTR(err); } +/* + * Caller should do after getting the following values. 
+ * 0: f2fs_put_page(page, 0) + * LOCKED_PAGE: f2fs_put_page(page, 1) + * error: nothing + */ static int read_node_page(struct page *page, int type) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); @@ -851,8 +877,14 @@ static int read_node_page(struct page *page, int type) get_node_info(sbi, page->index, &ni); - if (ni.blk_addr == NULL_ADDR) + if (ni.blk_addr == NULL_ADDR) { + f2fs_put_page(page, 1); return -ENOENT; + } + + if (PageUptodate(page)) + return LOCKED_PAGE; + return f2fs_readpage(sbi, page, ni.blk_addr, type); } @@ -863,40 +895,53 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) { struct address_space *mapping = sbi->node_inode->i_mapping; struct page *apage; + int err; apage = find_get_page(mapping, nid); - if (apage && PageUptodate(apage)) - goto release_out; + if (apage && PageUptodate(apage)) { + f2fs_put_page(apage, 0); + return; + } f2fs_put_page(apage, 0); apage = grab_cache_page(mapping, nid); if (!apage) return; - if (read_node_page(apage, READA)) - unlock_page(apage); - -release_out: - f2fs_put_page(apage, 0); + err = read_node_page(apage, READA); + if (err == 0) + f2fs_put_page(apage, 0); + else if (err == LOCKED_PAGE) + f2fs_put_page(apage, 1); return; } struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - int err; - struct page *page; struct address_space *mapping = sbi->node_inode->i_mapping; - + struct page *page; + int err; +repeat: page = grab_cache_page(mapping, nid); if (!page) return ERR_PTR(-ENOMEM); err = read_node_page(page, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (err < 0) return ERR_PTR(err); - } + else if (err == LOCKED_PAGE) + goto got_it; + lock_page(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + if (page->mapping != mapping) { + f2fs_put_page(page, 1); + goto repeat; + } +got_it: BUG_ON(nid != nid_of_node(page)); mark_page_accessed(page); return page; @@ -910,31 +955,27 @@ struct page *get_node_page_ra(struct page *parent, 
int start) { struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); struct address_space *mapping = sbi->node_inode->i_mapping; - int i, end; - int err = 0; - nid_t nid; + struct blk_plug plug; struct page *page; + int err, i, end; + nid_t nid; /* First, try getting the desired direct node. */ nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); - - page = find_get_page(mapping, nid); - if (page && PageUptodate(page)) - goto page_hit; - f2fs_put_page(page, 0); - repeat: page = grab_cache_page(mapping, nid); if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READA); - if (err) { - f2fs_put_page(page, 1); + err = read_node_page(page, READ_SYNC); + if (err < 0) return ERR_PTR(err); - } + else if (err == LOCKED_PAGE) + goto page_hit; + + blk_start_plug(&plug); /* Then, try readahead for siblings of the desired node */ end = start + MAX_RA_NODE; @@ -946,18 +987,19 @@ repeat: ra_node_page(sbi, nid); } -page_hit: - lock_page(page); - if (PageError(page)) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } + blk_finish_plug(&plug); - /* Has the page been truncated? 
*/ + lock_page(page); if (page->mapping != mapping) { f2fs_put_page(page, 1); goto repeat; } +page_hit: + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + mark_page_accessed(page); return page; } @@ -972,7 +1014,7 @@ void sync_inode_page(struct dnode_of_data *dn) if (!dn->inode_page_locked) unlock_page(dn->inode_page); } else { - f2fs_write_inode(dn->inode, NULL); + update_inode_page(dn->inode); } } @@ -1087,17 +1129,8 @@ static int f2fs_write_node_page(struct page *page, block_t new_addr; struct node_info ni; - if (wbc->for_reclaim) { - dec_page_count(sbi, F2FS_DIRTY_NODES); - wbc->pages_skipped++; - set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; - } - wait_on_page_writeback(page); - mutex_lock_op(sbi, NODE_WRITE); - /* get old block addr of this node page */ nid = nid_of_node(page); BUG_ON(page->index != nid); @@ -1105,17 +1138,25 @@ static int f2fs_write_node_page(struct page *page, get_node_info(sbi, nid, &ni); /* This page is already truncated */ - if (ni.blk_addr == NULL_ADDR) + if (ni.blk_addr == NULL_ADDR) { + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); return 0; + } - set_page_writeback(page); + if (wbc->for_reclaim) { + dec_page_count(sbi, F2FS_DIRTY_NODES); + wbc->pages_skipped++; + set_page_dirty(page); + return AOP_WRITEPAGE_ACTIVATE; + } - /* insert node offset */ + mutex_lock(&sbi->node_write); + set_page_writeback(page); write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); set_node_addr(sbi, &ni, new_addr); dec_page_count(sbi, F2FS_DIRTY_NODES); - - mutex_unlock_op(sbi, NODE_WRITE); + mutex_unlock(&sbi->node_write); unlock_page(page); return 0; } @@ -1130,12 +1171,11 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - struct block_device *bdev = sbi->sb->s_bdev; long nr_to_write = wbc->nr_to_write; /* First check balancing cached NAT entries */ if (try_to_free_nats(sbi, 
NAT_ENTRY_PER_BLOCK)) { - write_checkpoint(sbi, false); + f2fs_sync_fs(sbi->sb, true); return 0; } @@ -1144,10 +1184,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, return 0; /* if mounting is failed, skip writing node pages */ - wbc->nr_to_write = bio_get_nr_vecs(bdev); + wbc->nr_to_write = max_hw_blocks(sbi); sync_node_pages(sbi, 0, wbc); - wbc->nr_to_write = nr_to_write - - (bio_get_nr_vecs(bdev) - wbc->nr_to_write); + wbc->nr_to_write = nr_to_write - (max_hw_blocks(sbi) - wbc->nr_to_write); return 0; } @@ -1178,7 +1217,7 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) static int f2fs_release_node_page(struct page *page, gfp_t wait) { ClearPagePrivate(page); - return 0; + return 1; } /* @@ -1195,14 +1234,13 @@ const struct address_space_operations f2fs_node_aops = { static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) { struct list_head *this; - struct free_nid *i = NULL; + struct free_nid *i; list_for_each(this, head) { i = list_entry(this, struct free_nid, list); if (i->nid == n) - break; - i = NULL; + return i; } - return i; + return NULL; } static void __del_from_free_nid_list(struct free_nid *i) @@ -1211,11 +1249,29 @@ static void __del_from_free_nid_list(struct free_nid *i) kmem_cache_free(free_nid_slab, i); } -static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) { struct free_nid *i; + struct nat_entry *ne; + bool allocated = false; if (nm_i->fcnt > 2 * MAX_FREE_NIDS) + return -1; + + /* 0 nid should not be used */ + if (nid == 0) + return 0; + + if (!build) + goto retry; + + /* do not add allocated nids */ + read_lock(&nm_i->nat_tree_lock); + ne = __lookup_nat_cache(nm_i, nid); + if (ne && nat_get_blkaddr(ne) != NULL_ADDR) + allocated = true; + read_unlock(&nm_i->nat_tree_lock); + if (allocated) return 0; retry: i = kmem_cache_alloc(free_nid_slab, GFP_NOFS); @@ -1250,63 +1306,59 @@ static 
void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) spin_unlock(&nm_i->free_nid_list_lock); } -static int scan_nat_page(struct f2fs_nm_info *nm_i, +static void scan_nat_page(struct f2fs_nm_info *nm_i, struct page *nat_page, nid_t start_nid) { struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; - int fcnt = 0; int i; - /* 0 nid should not be used */ - if (start_nid == 0) - ++start_nid; - i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { - blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); + + if (start_nid >= nm_i->max_nid) + break; + + blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); BUG_ON(blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) - fcnt += add_free_nid(nm_i, start_nid); + if (blk_addr == NULL_ADDR) { + if (add_free_nid(nm_i, start_nid, true) < 0) + break; + } } - return fcnt; } static void build_free_nids(struct f2fs_sb_info *sbi) { - struct free_nid *fnid, *next_fnid; struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - nid_t nid = 0; - bool is_cycled = false; - int fcnt = 0; - int i; + int i = 0; + nid_t nid = nm_i->next_scan_nid; - nid = nm_i->next_scan_nid; - nm_i->init_scan_nid = nid; + /* Enough entries */ + if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK) + return; + /* readahead nat pages to be scanned */ ra_nat_pages(sbi, nid); while (1) { struct page *page = get_current_nat_page(sbi, nid); - fcnt += scan_nat_page(nm_i, page, nid); + scan_nat_page(nm_i, page, nid); f2fs_put_page(page, 1); nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); - - if (nid >= nm_i->max_nid) { + if (nid >= nm_i->max_nid) nid = 0; - is_cycled = true; - } - if (fcnt > MAX_FREE_NIDS) - break; - if (is_cycled && nm_i->init_scan_nid <= nid) + + if (i++ == FREE_NID_PAGES) break; } + /* go to the next free nat pages to find free nids abundantly */ nm_i->next_scan_nid = nid; /* find free nids from 
current sum_pages */ @@ -1315,22 +1367,11 @@ static void build_free_nids(struct f2fs_sb_info *sbi) block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); nid = le32_to_cpu(nid_in_journal(sum, i)); if (addr == NULL_ADDR) - add_free_nid(nm_i, nid); + add_free_nid(nm_i, nid, true); else remove_free_nid(nm_i, nid); } mutex_unlock(&curseg->curseg_mutex); - - /* remove the free nids from current allocated nids */ - list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) { - struct nat_entry *ne; - - read_lock(&nm_i->nat_tree_lock); - ne = __lookup_nat_cache(nm_i, fnid->nid); - if (ne && nat_get_blkaddr(ne) != NULL_ADDR) - remove_free_nid(nm_i, fnid->nid); - read_unlock(&nm_i->nat_tree_lock); - } } /* @@ -1344,41 +1385,36 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; struct list_head *this; retry: - mutex_lock(&nm_i->build_lock); - if (!nm_i->fcnt) { - /* scan NAT in order to build free nid list */ - build_free_nids(sbi); - if (!nm_i->fcnt) { - mutex_unlock(&nm_i->build_lock); - return false; - } - } - mutex_unlock(&nm_i->build_lock); + if (sbi->total_valid_node_count + 1 >= nm_i->max_nid) + return false; - /* - * We check fcnt again since previous check is racy as - * we didn't hold free_nid_list_lock. So other thread - * could consume all of free nids. 
- */ spin_lock(&nm_i->free_nid_list_lock); - if (!nm_i->fcnt) { - spin_unlock(&nm_i->free_nid_list_lock); - goto retry; - } - BUG_ON(list_empty(&nm_i->free_nid_list)); - list_for_each(this, &nm_i->free_nid_list) { - i = list_entry(this, struct free_nid, list); - if (i->state == NID_NEW) - break; - } + /* We should not use stale free nids created by build_free_nids */ + if (nm_i->fcnt && !sbi->on_build_free_nids) { + BUG_ON(list_empty(&nm_i->free_nid_list)); + list_for_each(this, &nm_i->free_nid_list) { + i = list_entry(this, struct free_nid, list); + if (i->state == NID_NEW) + break; + } - BUG_ON(i->state != NID_NEW); - *nid = i->nid; - i->state = NID_ALLOC; - nm_i->fcnt--; + BUG_ON(i->state != NID_NEW); + *nid = i->nid; + i->state = NID_ALLOC; + nm_i->fcnt--; + spin_unlock(&nm_i->free_nid_list_lock); + return true; + } spin_unlock(&nm_i->free_nid_list_lock); - return true; + + /* Let's scan nat pages and its caches to get free nids */ + mutex_lock(&nm_i->build_lock); + sbi->on_build_free_nids = 1; + build_free_nids(sbi); + sbi->on_build_free_nids = 0; + mutex_unlock(&nm_i->build_lock); + goto retry; } /* @@ -1391,10 +1427,8 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); - if (i) { - BUG_ON(i->state != NID_ALLOC); - __del_from_free_nid_list(i); - } + BUG_ON(!i || i->state != NID_ALLOC); + __del_from_free_nid_list(i); spin_unlock(&nm_i->free_nid_list_lock); } @@ -1403,8 +1437,19 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) */ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) { - alloc_nid_done(sbi, nid); - add_free_nid(NM_I(sbi), nid); + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i; + + spin_lock(&nm_i->free_nid_list_lock); + i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + BUG_ON(!i || i->state != NID_ALLOC); + if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { + __del_from_free_nid_list(i); + } else { + i->state = NID_NEW; + 
nm_i->fcnt++; + } + spin_unlock(&nm_i->free_nid_list_lock); } void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, @@ -1475,23 +1520,24 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i++, sum_entry++) { + /* + * In order to read next node page, + * we must clear PageUptodate flag. + */ + ClearPageUptodate(page); + if (f2fs_readpage(sbi, page, addr, READ_SYNC)) goto out; + lock_page(page); rn = (struct f2fs_node *)page_address(page); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; sum_entry->ofs_in_node = 0; addr++; - - /* - * In order to read next node page, - * we must clear PageUptodate flag. - */ - ClearPageUptodate(page); } -out: unlock_page(page); +out: __free_pages(page, 0); return 0; } @@ -1614,13 +1660,11 @@ flush_now: nid_in_journal(sum, offset) = cpu_to_le32(nid); } - if (nat_get_blkaddr(ne) == NULL_ADDR) { + if (nat_get_blkaddr(ne) == NULL_ADDR && + add_free_nid(NM_I(sbi), nid, false) <= 0) { write_lock(&nm_i->nat_tree_lock); __del_from_nat_cache(nm_i, ne); write_unlock(&nm_i->nat_tree_lock); - - /* We can reuse this freed nid at this point */ - add_free_nid(NM_I(sbi), nid); } else { write_lock(&nm_i->nat_tree_lock); __clear_nat_cache_dirty(nm_i, ne); @@ -1661,19 +1705,16 @@ static int init_node_manager(struct f2fs_sb_info *sbi) spin_lock_init(&nm_i->free_nid_list_lock); rwlock_init(&nm_i->nat_tree_lock); - nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); - nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); - - nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL); - if (!nm_i->nat_bitmap) - return -ENOMEM; + nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); if (!version_bitmap) return -EFAULT; - /* copy version bitmap */ - memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size); + nm_i->nat_bitmap = kmemdup(version_bitmap, 
nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap) + return -ENOMEM; return 0; } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index afdb130f782e..0a2d72f0024d 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -29,6 +29,9 @@ /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 +/* return value for read_node_page */ +#define LOCKED_PAGE 1 + /* * For node information */ @@ -239,7 +242,7 @@ static inline bool IS_DNODE(struct page *node_page) return false; if (ofs >= 6 + 2 * NIDS_PER_BLOCK) { ofs -= 6 + 2 * NIDS_PER_BLOCK; - if ((long int)ofs % (NIDS_PER_BLOCK + 1)) + if (!((long int)ofs % (NIDS_PER_BLOCK + 1))) return false; } return true; @@ -277,6 +280,21 @@ static inline int is_cold_file(struct inode *inode) return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT; } +static inline void set_cold_file(struct inode *inode) +{ + F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT; +} + +static inline int is_cp_file(struct inode *inode) +{ + return F2FS_I(inode)->i_advise & FADVISE_CP_BIT; +} + +static inline void set_cp_file(struct inode *inode) +{ + F2FS_I(inode)->i_advise |= FADVISE_CP_BIT; +} + static inline int is_cold_data(struct page *page) { return PageChecked(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b235215ac138..60c8a5097058 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -53,7 +53,7 @@ static int recover_dentry(struct page *ipage, struct inode *inode) dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino)); if (IS_ERR(dir)) { - err = -EINVAL; + err = PTR_ERR(dir); goto out; } @@ -112,11 +112,14 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) while (1) { struct fsync_inode_entry *entry; - if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC)) + err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + if (err) goto out; + lock_page(page); + if (cp_ver != cpver_of_node(page)) - goto out; + goto unlock_out; if (!is_fsync_dnode(page)) goto next; @@ 
-129,24 +132,23 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) FI_INC_LINK); } else { if (IS_INODE(page) && is_dent_dnode(page)) { - if (recover_inode_page(sbi, page)) { - err = -ENOMEM; - goto out; - } + err = recover_inode_page(sbi, page); + if (err) + goto unlock_out; } /* add this fsync inode to the list */ entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); if (!entry) { err = -ENOMEM; - goto out; + goto unlock_out; } entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); if (IS_ERR(entry->inode)) { err = PTR_ERR(entry->inode); kmem_cache_free(fsync_entry_slab, entry); - goto out; + goto unlock_out; } list_add_tail(&entry->list, head); @@ -154,16 +156,20 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) } if (IS_INODE(page)) { err = recover_inode(entry->inode, page); - if (err) - goto out; + if (err == -ENOENT) { + goto next; + } else if (err) { + err = -EINVAL; + goto unlock_out; + } } next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); - ClearPageUptodate(page); } -out: +unlock_out: unlock_page(page); +out: __free_pages(page, 0); return err; } @@ -232,13 +238,15 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, iput(inode); } -static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, +static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, block_t blkaddr) { unsigned int start, end; struct dnode_of_data dn; struct f2fs_summary sum; struct node_info ni; + int err = 0; + int ilock; start = start_bidx_of_node(ofs_of_node(page)); if (IS_INODE(page)) @@ -246,9 +254,14 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, else end = start + ADDRS_PER_BLOCK; + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, start, 0)) - return; + + err = get_dnode_of_data(&dn, start, ALLOC_NODE); + if (err) { + mutex_unlock_op(sbi, ilock); + return err; + 
} wait_on_page_writeback(dn.node_page); @@ -293,14 +306,17 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); f2fs_put_dnode(&dn); + mutex_unlock_op(sbi, ilock); + return 0; } -static void recover_data(struct f2fs_sb_info *sbi, +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head, int type) { unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); struct curseg_info *curseg; struct page *page; + int err = 0; block_t blkaddr; /* get node pages in the current segment */ @@ -310,23 +326,29 @@ static void recover_data(struct f2fs_sb_info *sbi, /* read node page */ page = alloc_page(GFP_NOFS | __GFP_ZERO); if (IS_ERR(page)) - return; + return -ENOMEM; + lock_page(page); while (1) { struct fsync_inode_entry *entry; - if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC)) + err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + if (err) goto out; + lock_page(page); + if (cp_ver != cpver_of_node(page)) - goto out; + goto unlock_out; entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) goto next; - do_recover_data(sbi, entry->inode, page, blkaddr); + err = do_recover_data(sbi, entry->inode, page, blkaddr); + if (err) + goto out; if (entry->blkaddr == blkaddr) { iput(entry->inode); @@ -336,28 +358,32 @@ static void recover_data(struct f2fs_sb_info *sbi, next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); - ClearPageUptodate(page); } -out: +unlock_out: unlock_page(page); +out: __free_pages(page, 0); - allocate_new_segments(sbi); + if (!err) + allocate_new_segments(sbi); + return err; } -void recover_fsync_data(struct f2fs_sb_info *sbi) +int recover_fsync_data(struct f2fs_sb_info *sbi) { struct list_head inode_list; + int err; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry), NULL); if (unlikely(!fsync_entry_slab)) - return; + return -ENOMEM; INIT_LIST_HEAD(&inode_list); /* step #1: find 
fsynced inode numbers */ - if (find_fsync_dnodes(sbi, &inode_list)) + err = find_fsync_dnodes(sbi, &inode_list); + if (err) goto out; if (list_empty(&inode_list)) @@ -365,11 +391,12 @@ void recover_fsync_data(struct f2fs_sb_info *sbi) /* step #2: recover data */ sbi->por_doing = 1; - recover_data(sbi, &inode_list, CURSEG_WARM_NODE); + err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); sbi->por_doing = 0; BUG_ON(!list_empty(&inode_list)); out: destroy_fsync_dnodes(sbi, &inode_list); kmem_cache_destroy(fsync_entry_slab); write_checkpoint(sbi, false); + return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 777f17e496e6..d8e84e49a5c3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -18,6 +18,7 @@ #include "f2fs.h" #include "segment.h" #include "node.h" +#include <trace/events/f2fs.h> /* * This function balances dirty node and dentry pages. @@ -49,9 +50,20 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); + enum dirty_type t = DIRTY_HOT_DATA; + dirty_type = sentry->type; + if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]++; + + /* Only one bitmap should be set */ + for (; t <= DIRTY_COLD_NODE; t++) { + if (t == dirty_type) + continue; + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]--; + } } } @@ -64,13 +76,16 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, dirty_i->nr_dirty[dirty_type]--; if (dirty_type == DIRTY) { - struct seg_entry *sentry = get_seg_entry(sbi, segno); - dirty_type = sentry->type; - if (test_and_clear_bit(segno, - dirty_i->dirty_segmap[dirty_type])) - dirty_i->nr_dirty[dirty_type]--; - clear_bit(segno, dirty_i->victim_segmap[FG_GC]); - clear_bit(segno, dirty_i->victim_segmap[BG_GC]); + enum dirty_type t = DIRTY_HOT_DATA; + + /* clear all the bitmaps */ + for (; t <= DIRTY_COLD_NODE; t++) + if 
(test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]--; + + if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) + clear_bit(GET_SECNO(sbi, segno), + dirty_i->victim_secmap); } } @@ -296,13 +311,12 @@ static void write_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } -static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, - int ofs_unit, int type) +static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE]; - unsigned int segno, next_segno, i; - int ofs = 0; + unsigned int segno; + unsigned int ofs = 0; /* * If there is not enough reserved sections, @@ -318,28 +332,46 @@ static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, if (IS_NODESEG(type)) return NULL_SEGNO; next: - segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs++); - ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit; + segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs); + ofs += sbi->segs_per_sec; + if (segno < TOTAL_SEGS(sbi)) { + int i; + /* skip intermediate segments in a section */ - if (segno % ofs_unit) + if (segno % sbi->segs_per_sec) goto next; - /* skip if whole section is not prefree */ - next_segno = find_next_zero_bit(prefree_segmap, - TOTAL_SEGS(sbi), segno + 1); - if (next_segno - segno < ofs_unit) + /* skip if the section is currently used */ + if (sec_usage_check(sbi, GET_SECNO(sbi, segno))) goto next; + /* skip if whole section is not prefree */ + for (i = 1; i < sbi->segs_per_sec; i++) + if (!test_bit(segno + i, prefree_segmap)) + goto next; + /* skip if whole section was not free at the last checkpoint */ - for (i = 0; i < ofs_unit; i++) - if (get_seg_entry(sbi, segno)->ckpt_valid_blocks) + for (i = 0; i < sbi->segs_per_sec; i++) + if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks) goto next; + return segno; } return NULL_SEGNO; } +static int 
is_next_segment_free(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno = curseg->segno; + struct free_segmap_info *free_i = FREE_I(sbi); + + if (segno + 1 < TOTAL_SEGS(sbi) && (segno + 1) % sbi->segs_per_sec) + return !test_bit(segno + 1, free_i->free_segmap); + return 0; +} + /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG @@ -348,9 +380,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, int dir) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int total_secs = sbi->total_sections; unsigned int segno, secno, zoneno; - unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone; + unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = *newseg / sbi->segs_per_sec; unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); unsigned int left_start = hint; @@ -363,16 +394,17 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, TOTAL_SEGS(sbi), *newseg + 1); - if (segno < TOTAL_SEGS(sbi)) + if (segno - *newseg < sbi->segs_per_sec - + (*newseg % sbi->segs_per_sec)) goto got_it; } find_other_zone: - secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint); - if (secno >= total_secs) { + secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint); + if (secno >= TOTAL_SECS(sbi)) { if (dir == ALLOC_RIGHT) { secno = find_next_zero_bit(free_i->free_secmap, - total_secs, 0); - BUG_ON(secno >= total_secs); + TOTAL_SECS(sbi), 0); + BUG_ON(secno >= TOTAL_SECS(sbi)); } else { go_left = 1; left_start = hint - 1; @@ -387,8 +419,8 @@ find_other_zone: continue; } left_start = find_next_zero_bit(free_i->free_secmap, - total_secs, 0); - BUG_ON(left_start >= total_secs); + TOTAL_SECS(sbi), 0); + BUG_ON(left_start >= 
TOTAL_SECS(sbi)); break; } secno = left_start; @@ -561,20 +593,20 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, int type, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int ofs_unit; if (force) { new_curseg(sbi, type, true); goto out; } - ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec; - curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type); + curseg->next_segno = check_prefree_segments(sbi, type); if (curseg->next_segno != NULL_SEGNO) change_curseg(sbi, type, false); else if (type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); + else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) + new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type, true); else @@ -656,10 +688,16 @@ static void do_submit_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) rw = WRITE_FLUSH_FUA; + if (btype == META) + rw |= REQ_META; + if (sbi->bio[btype]) { struct bio_private *p = sbi->bio[btype]->bi_private; p->sbi = sbi; sbi->bio[btype]->bi_end_io = f2fs_end_io_write; + + trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]); + if (type == META_FLUSH) { DECLARE_COMPLETION_ONSTACK(wait); p->is_sync = true; @@ -696,7 +734,7 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, do_submit_bio(sbi, type, false); alloc_new: if (sbi->bio[type] == NULL) { - sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev)); + sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi)); sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); /* * The end_io will be assigned at the sumbission phase. 
@@ -714,6 +752,7 @@ alloc_new: sbi->last_block_in_bio[type] = blk_addr; up_write(&sbi->bio_sem); + trace_f2fs_submit_write_page(page, blk_addr, type); } static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) @@ -1390,7 +1429,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) } if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = vzalloc(sbi->total_sections * + sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) * sizeof(struct sec_entry)); if (!sit_i->sec_entries) return -ENOMEM; @@ -1403,10 +1442,9 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); - dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL); + dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); if (!dst_bitmap) return -ENOMEM; - memcpy(dst_bitmap, src_bitmap, bitmap_size); /* init SIT information */ sit_i->s_ops = &default_salloc_ops; @@ -1442,7 +1480,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) if (!free_i->free_segmap) return -ENOMEM; - sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections); + sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -1559,14 +1597,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) } } -static int init_victim_segmap(struct f2fs_sb_info *sbi) +static int init_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); - dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL); - dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL); - if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC]) + dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dirty_i->victim_secmap) return -ENOMEM; return 0; } @@ -1593,7 +1630,7 @@ static int 
build_dirty_segmap(struct f2fs_sb_info *sbi) } init_dirty_segmap(sbi); - return init_victim_segmap(sbi); + return init_victim_secmap(sbi); } /* @@ -1680,18 +1717,10 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, mutex_unlock(&dirty_i->seglist_lock); } -void reset_victim_segmap(struct f2fs_sb_info *sbi) -{ - unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); - memset(DIRTY_I(sbi)->victim_segmap[FG_GC], 0, bitmap_size); -} - -static void destroy_victim_segmap(struct f2fs_sb_info *sbi) +static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - - kfree(dirty_i->victim_segmap[FG_GC]); - kfree(dirty_i->victim_segmap[BG_GC]); + kfree(dirty_i->victim_secmap); } static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) @@ -1706,7 +1735,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) for (i = 0; i < NR_DIRTY_TYPE; i++) discard_dirty_segmap(sbi, i); - destroy_victim_segmap(sbi); + destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; kfree(dirty_i); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 552dadbb2327..062424a0e4c3 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -8,10 +8,13 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ +#include <linux/blkdev.h> + /* constant macro */ #define NULL_SEGNO ((unsigned int)(~0)) +#define NULL_SECNO ((unsigned int)(~0)) -/* V: Logical segment # in volume, R: Relative segment # in main area */ +/* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) @@ -23,13 +26,13 @@ ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \ (t == CURSEG_WARM_NODE)) -#define IS_CURSEG(sbi, segno) \ - ((segno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) +#define IS_CURSEG(sbi, seg) \ + ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ @@ -81,9 +84,12 @@ #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) #define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) +#define TOTAL_SECS(sbi) (sbi->total_sections) #define SECTOR_FROM_BLOCK(sbi, blk_addr) \ (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) +#define SECTOR_TO_BLOCK(sbi, sectors) \ + (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) /* during checkpoint, bio_private is used to synchronize the last bio */ struct bio_private { @@ -213,7 +219,7 @@ struct dirty_seglist_info { unsigned long *dirty_segmap[NR_DIRTY_TYPE]; struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; 
/* # of dirty segments */ - unsigned long *victim_segmap[2]; /* BG_GC, FG_GC */ + unsigned long *victim_secmap; /* background GC victims */ }; /* victim selection function for cleaning and SSR */ @@ -464,8 +470,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) static inline int utilization(struct f2fs_sb_info *sbi) { - return (long int)valid_user_blocks(sbi) * 100 / - (long int)sbi->user_block_count; + return div_u64(valid_user_blocks(sbi) * 100, sbi->user_block_count); } /* @@ -616,3 +621,17 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count) - (base + 1) + type; } + +static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) +{ + if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) + return true; + return false; +} + +static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8c117649a035..8555f7df82c7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -12,7 +12,6 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/statfs.h> -#include <linux/proc_fs.h> #include <linux/buffer_head.h> #include <linux/backing-dev.h> #include <linux/kthread.h> @@ -21,12 +20,17 @@ #include <linux/seq_file.h> #include <linux/random.h> #include <linux/exportfs.h> +#include <linux/blkdev.h> #include <linux/f2fs_fs.h> #include "f2fs.h" #include "node.h" +#include "segment.h" #include "xattr.h" +#define CREATE_TRACE_POINTS +#include <trace/events/f2fs.h> + static struct kmem_cache *f2fs_inode_cachep; enum { @@ -82,7 +86,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); - /* Initilize f2fs-specific inode info */ + /* Initialize f2fs-specific inode info */ 
fi->vfs_inode.i_version = 1; atomic_set(&fi->dirty_dents, 0); fi->i_current_depth = 1; @@ -94,6 +98,20 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) return &fi->vfs_inode; } +static int f2fs_drop_inode(struct inode *inode) +{ + /* + * This is to avoid a deadlock condition like below. + * writeback_single_inode(inode) + * - f2fs_write_data_page + * - f2fs_gc -> iput -> evict + * - inode_wait_for_writeback(inode) + */ + if (!inode_unhashed(inode) && inode->i_state & I_SYNC) + return 0; + return generic_drop_inode(inode); +} + static void f2fs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); @@ -132,13 +150,18 @@ int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + trace_f2fs_sync_fs(sb, sync); + if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES)) return 0; - if (sync) + if (sync) { + mutex_lock(&sbi->gc_mutex); write_checkpoint(sbi, false); - else + mutex_unlock(&sbi->gc_mutex); + } else { f2fs_balance_fs(sbi); + } return 0; } @@ -180,7 +203,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = sbi->total_node_count; buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); - buf->f_namelen = F2FS_MAX_NAME_LEN; + buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); @@ -223,6 +246,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) static struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, + .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, .write_inode = f2fs_write_inode, .show_options = f2fs_show_options, @@ -457,6 +481,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->root_ino_num = le32_to_cpu(raw_super->root_ino); sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); + sbi->cur_victim_sec = NULL_SECNO; for (i = 0; i < NR_COUNT_TYPE; i++) 
atomic_set(&sbi->nr_pages[i], 0); @@ -473,7 +498,7 @@ static int validate_superblock(struct super_block *sb, if (!*raw_super_buf) { f2fs_msg(sb, KERN_ERR, "unable to read %s superblock", super); - return 1; + return -EIO; } *raw_super = (struct f2fs_super_block *) @@ -485,7 +510,7 @@ static int validate_superblock(struct super_block *sb, f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " "in %s superblock", super); - return 1; + return -EINVAL; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) @@ -508,9 +533,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - if (validate_superblock(sb, &raw_super, &raw_super_buf, 0)) { + err = validate_superblock(sb, &raw_super, &raw_super_buf, 0); + if (err) { brelse(raw_super_buf); - if (validate_superblock(sb, &raw_super, &raw_super_buf, 1)) + /* check secondary superblock when primary failed */ + err = validate_superblock(sb, &raw_super, &raw_super_buf, 1); + if (err) goto free_sb_buf; } /* init some FS parameters */ @@ -525,7 +553,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi, POSIX_ACL); #endif /* parse mount options */ - if (parse_options(sb, sbi, (char *)data)) + err = parse_options(sb, sbi, (char *)data); + if (err) goto free_sb_buf; sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); @@ -547,11 +576,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->raw_super = raw_super; sbi->raw_super_buf = raw_super_buf; mutex_init(&sbi->gc_mutex); - mutex_init(&sbi->write_inode); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); - for (i = 0; i < NR_LOCK_TYPE; i++) + for (i = 0; i < NR_GLOBAL_LOCKS; i++) mutex_init(&sbi->fs_lock[i]); + mutex_init(&sbi->node_write); sbi->por_doing = 0; spin_lock_init(&sbi->stat_lock); init_rwsem(&sbi->bio_sem); @@ -638,8 +667,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } /* 
recover fsynced data */ - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) - recover_fsync_data(sbi); + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + err = recover_fsync_data(sbi); + if (err) + f2fs_msg(sb, KERN_ERR, + "Cannot recover all fsync data errno=%ld", err); + } /* After POR, we can run background GC thread */ err = start_gc_thread(sbi); @@ -650,6 +683,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto fail; + if (test_opt(sbi, DISCARD)) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + if (!blk_queue_discard(q)) + f2fs_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } + return 0; fail: stop_gc_thread(sbi); @@ -687,6 +728,7 @@ static struct file_system_type f2fs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8038c0496504..0b02dce31356 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -307,27 +307,30 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, int error, found, free, newsize; size_t name_len; char *pval; + int ilock; if (name == NULL) return -EINVAL; - name_len = strlen(name); if (value == NULL) value_len = 0; - if (name_len > 255 || value_len > MAX_VALUE_LEN) + name_len = strlen(name); + + if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN) return -ERANGE; f2fs_balance_fs(sbi); - mutex_lock_op(sbi, NODE_NEW); + ilock = mutex_lock_op(sbi); + if (!fi->i_xattr_nid) { /* Allocate new attribute block */ struct dnode_of_data dn; if (!alloc_nid(sbi, &fi->i_xattr_nid)) { - mutex_unlock_op(sbi, NODE_NEW); - return -ENOSPC; + error = -ENOSPC; + goto exit; } set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); mark_inode_dirty(inode); @@ -336,8 +339,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, if (IS_ERR(page)) { alloc_nid_failed(sbi, 
fi->i_xattr_nid); fi->i_xattr_nid = 0; - mutex_unlock_op(sbi, NODE_NEW); - return PTR_ERR(page); + error = PTR_ERR(page); + goto exit; } alloc_nid_done(sbi, fi->i_xattr_nid); @@ -349,8 +352,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, /* The inode already has an extended attribute block. */ page = get_node_page(sbi, fi->i_xattr_nid); if (IS_ERR(page)) { - mutex_unlock_op(sbi, NODE_NEW); - return PTR_ERR(page); + error = PTR_ERR(page); + goto exit; } base_addr = page_address(page); @@ -432,12 +435,13 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, inode->i_ctime = CURRENT_TIME; clear_inode_flag(fi, FI_ACL_MODE); } - f2fs_write_inode(inode, NULL); - mutex_unlock_op(sbi, NODE_NEW); + update_inode_page(inode); + mutex_unlock_op(sbi, ilock); return 0; cleanup: f2fs_put_page(page, 1); - mutex_unlock_op(sbi, NODE_NEW); +exit: + mutex_unlock_op(sbi, ilock); return error; } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 165012ef363a..7a6f02caf286 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -964,6 +964,29 @@ int fat_scan(struct inode *dir, const unsigned char *name, } EXPORT_SYMBOL_GPL(fat_scan); +/* + * Scans a directory for a given logstart. + * Returns an error code or zero. 
+ */ +int fat_scan_logstart(struct inode *dir, int i_logstart, + struct fat_slot_info *sinfo) +{ + struct super_block *sb = dir->i_sb; + + sinfo->slot_off = 0; + sinfo->bh = NULL; + while (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh, + &sinfo->de) >= 0) { + if (fat_get_start(MSDOS_SB(sb), sinfo->de) == i_logstart) { + sinfo->slot_off -= sizeof(*sinfo->de); + sinfo->nr_slots = 1; + sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); + return 0; + } + } + return -ENOENT; +} + static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) { struct super_block *sb = dir->i_sb; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e9cc3f0d58e2..21664fcf3616 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -23,6 +23,9 @@ #define FAT_ERRORS_PANIC 2 /* panic on error */ #define FAT_ERRORS_RO 3 /* remount r/o on error */ +#define FAT_NFS_STALE_RW 1 /* NFS RW support, can cause ESTALE */ +#define FAT_NFS_NOSTALE_RO 2 /* NFS RO support, no ESTALE issue */ + struct fat_mount_options { kuid_t fs_uid; kgid_t fs_gid; @@ -34,6 +37,7 @@ struct fat_mount_options { unsigned short shortname; /* flags for shortname display/create rule */ unsigned char name_check; /* r = relaxed, n = normal, s = strict */ unsigned char errors; /* On error: continue, panic, remount-ro */ + unsigned char nfs; /* NFS support: nostale_ro, stale_rw */ unsigned short allow_utime;/* permission for setting the [am]time */ unsigned quiet:1, /* set = fake successful chmods and chowns */ showexec:1, /* set = only set x bit for com/exe/bat */ @@ -48,8 +52,7 @@ struct fat_mount_options { usefree:1, /* Use free_clusters for FAT32 */ tz_set:1, /* Filesystem timestamps' offset set */ rodir:1, /* allow ATTR_RO for directory */ - discard:1, /* Issue discard requests on deletions */ - nfs:1; /* Do extra work needed for NFS export */ + discard:1; /* Issue discard requests on deletions */ }; #define FAT_HASH_BITS 8 @@ -72,6 +75,7 @@ struct msdos_sb_info { unsigned long root_cluster; /* first cluster of 
the root directory */ unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */ struct mutex fat_lock; + struct mutex nfs_build_inode_lock; struct mutex s_lock; unsigned int prev_free; /* previously allocated cluster number */ unsigned int free_clusters; /* -1 if undefined */ @@ -215,6 +219,27 @@ static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus) + sbi->data_start; } +static inline void fat_get_blknr_offset(struct msdos_sb_info *sbi, + loff_t i_pos, sector_t *blknr, int *offset) +{ + *blknr = i_pos >> sbi->dir_per_block_bits; + *offset = i_pos & (sbi->dir_per_block - 1); +} + +static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, + struct inode *inode) +{ + loff_t i_pos; +#if BITS_PER_LONG == 32 + spin_lock(&sbi->inode_hash_lock); +#endif + i_pos = MSDOS_I(inode)->i_pos; +#if BITS_PER_LONG == 32 + spin_unlock(&sbi->inode_hash_lock); +#endif + return i_pos; +} + static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len) { #ifdef __BIG_ENDIAN @@ -271,6 +296,8 @@ extern int fat_dir_empty(struct inode *dir); extern int fat_subdirs(struct inode *dir); extern int fat_scan(struct inode *dir, const unsigned char *name, struct fat_slot_info *sinfo); +extern int fat_scan_logstart(struct inode *dir, int i_logstart, + struct fat_slot_info *sinfo); extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, struct msdos_dir_entry **de); extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts); @@ -348,6 +375,7 @@ extern struct inode *fat_build_inode(struct super_block *sb, extern int fat_sync_inode(struct inode *inode); extern int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, void (*setup)(struct super_block *)); +extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de); extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); @@ -382,12 +410,8 @@ int fat_cache_init(void); void fat_cache_destroy(void); /* 
fat/nfs.c */ -struct fid; -extern struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type); -extern struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type); -extern struct dentry *fat_get_parent(struct dentry *child_dir); +extern const struct export_operations fat_export_ops; +extern const struct export_operations fat_export_ops_nostale; /* helper for printk */ typedef unsigned long long llu; diff --git a/fs/fat/file.c b/fs/fat/file.c index 3978f8ca1823..b0b632e50ddb 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -306,6 +306,11 @@ int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) struct inode *inode = dentry->d_inode; generic_fillattr(inode, stat); stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; + + if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) { + /* Use i_pos for ino. This is used as fileid of nfs. */ + stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode); + } return 0; } EXPORT_SYMBOL_GPL(fat_getattr); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index acf6e479b443..dfce656ddb33 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -18,8 +18,8 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/buffer_head.h> -#include <linux/exportfs.h> #include <linux/mount.h> +#include <linux/aio.h> #include <linux/vfs.h> #include <linux/parser.h> #include <linux/uio.h> @@ -385,7 +385,7 @@ static int fat_calc_dir_size(struct inode *inode) } /* doesn't deal with root inode */ -static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) +int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int error; @@ -444,12 +444,25 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) return 0; } +static inline void fat_lock_build_inode(struct msdos_sb_info *sbi) +{ + if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) + 
mutex_lock(&sbi->nfs_build_inode_lock); +} + +static inline void fat_unlock_build_inode(struct msdos_sb_info *sbi) +{ + if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) + mutex_unlock(&sbi->nfs_build_inode_lock); +} + struct inode *fat_build_inode(struct super_block *sb, struct msdos_dir_entry *de, loff_t i_pos) { struct inode *inode; int err; + fat_lock_build_inode(MSDOS_SB(sb)); inode = fat_iget(sb, i_pos); if (inode) goto out; @@ -469,6 +482,7 @@ struct inode *fat_build_inode(struct super_block *sb, fat_attach(inode, i_pos); insert_inode_hash(inode); out: + fat_unlock_build_inode(MSDOS_SB(sb)); return inode; } @@ -655,20 +669,6 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, - struct inode *inode) -{ - loff_t i_pos; -#if BITS_PER_LONG == 32 - spin_lock(&sbi->inode_hash_lock); -#endif - i_pos = MSDOS_I(inode)->i_pos; -#if BITS_PER_LONG == 32 - spin_unlock(&sbi->inode_hash_lock); -#endif - return i_pos; -} - static int __fat_write_inode(struct inode *inode, int wait) { struct super_block *sb = inode->i_sb; @@ -676,7 +676,8 @@ static int __fat_write_inode(struct inode *inode, int wait) struct buffer_head *bh; struct msdos_dir_entry *raw_entry; loff_t i_pos; - int err; + sector_t blocknr; + int err, offset; if (inode->i_ino == MSDOS_ROOT_INO) return 0; @@ -686,7 +687,8 @@ retry: if (!i_pos) return 0; - bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); + fat_get_blknr_offset(sbi, i_pos, &blocknr, &offset); + bh = sb_bread(sb, blocknr); if (!bh) { fat_msg(sb, KERN_ERR, "unable to read inode block " "for updating (i_pos %lld)", i_pos); @@ -699,8 +701,7 @@ retry: goto retry; } - raw_entry = &((struct msdos_dir_entry *) (bh->b_data)) - [i_pos & (sbi->dir_per_block - 1)]; + raw_entry = &((struct msdos_dir_entry *) (bh->b_data))[offset]; if (S_ISDIR(inode->i_mode)) raw_entry->size = 0; else @@ -761,12 +762,6 @@ static const struct super_operations fat_sops = { .show_options = 
fat_show_options, }; -static const struct export_operations fat_export_ops = { - .fh_to_dentry = fat_fh_to_dentry, - .fh_to_parent = fat_fh_to_parent, - .get_parent = fat_get_parent, -}; - static int fat_show_options(struct seq_file *m, struct dentry *root) { struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb); @@ -814,8 +809,6 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",usefree"); if (opts->quiet) seq_puts(m, ",quiet"); - if (opts->nfs) - seq_puts(m, ",nfs"); if (opts->showexec) seq_puts(m, ",showexec"); if (opts->sys_immutable) @@ -849,6 +842,10 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",errors=panic"); else seq_puts(m, ",errors=remount-ro"); + if (opts->nfs == FAT_NFS_NOSTALE_RO) + seq_puts(m, ",nfs=nostale_ro"); + else if (opts->nfs) + seq_puts(m, ",nfs=stale_rw"); if (opts->discard) seq_puts(m, ",discard"); @@ -865,7 +862,7 @@ enum { Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset, - Opt_err, + Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, }; static const match_table_t fat_tokens = { @@ -895,7 +892,9 @@ static const match_table_t fat_tokens = { {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_discard, "discard"}, - {Opt_nfs, "nfs"}, + {Opt_nfs_stale_rw, "nfs"}, + {Opt_nfs_stale_rw, "nfs=stale_rw"}, + {Opt_nfs_nostale_ro, "nfs=nostale_ro"}, {Opt_obsolete, "conv=binary"}, {Opt_obsolete, "conv=text"}, {Opt_obsolete, "conv=auto"}, @@ -1092,6 +1091,12 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, case Opt_err_ro: opts->errors = FAT_ERRORS_RO; break; + case Opt_nfs_stale_rw: + opts->nfs = FAT_NFS_STALE_RW; + break; + case Opt_nfs_nostale_ro: + opts->nfs = FAT_NFS_NOSTALE_RO; + break; /* msdos specific */ case Opt_dots: @@ -1150,9 +1155,6 @@ static int parse_options(struct super_block 
*sb, char *options, int is_vfat, case Opt_discard: opts->discard = 1; break; - case Opt_nfs: - opts->nfs = 1; - break; /* obsolete mount options */ case Opt_obsolete: @@ -1183,6 +1185,10 @@ out: opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH); if (opts->unicode_xlate) opts->utf8 = 0; + if (opts->nfs == FAT_NFS_NOSTALE_RO) { + sb->s_flags |= MS_RDONLY; + sb->s_export_op = &fat_export_ops_nostale; + } return 0; } @@ -1193,7 +1199,7 @@ static int fat_read_root(struct inode *inode) struct msdos_sb_info *sbi = MSDOS_SB(sb); int error; - MSDOS_I(inode)->i_pos = 0; + MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO; inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode->i_version++; @@ -1256,6 +1262,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sb->s_magic = MSDOS_SUPER_MAGIC; sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; + mutex_init(&sbi->nfs_build_inode_lock); ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index e2cfda94a28d..081b759cff83 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -668,6 +668,7 @@ static struct file_system_type msdos_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("msdos"); static int __init init_msdos_fs(void) { diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index ac959d655e7d..2da952036a3d 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1073,6 +1073,7 @@ static struct file_system_type vfat_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("vfat"); static int __init init_vfat_fs(void) { diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index 499c10438ca2..93e14933dcb6 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -14,6 +14,18 @@ #include <linux/exportfs.h> #include "fat.h" +struct fat_fid { + u32 i_gen; + u32 i_pos_low; + u16 i_pos_hi; + u16 parent_i_pos_hi; + 
u32 parent_i_pos_low; + u32 parent_i_gen; +}; + +#define FAT_FID_SIZE_WITHOUT_PARENT 3 +#define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32)) + /** * Look up a directory inode given its starting cluster. */ @@ -38,63 +50,252 @@ static struct inode *fat_dget(struct super_block *sb, int i_logstart) return inode; } -static struct inode *fat_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) +static struct inode *fat_ilookup(struct super_block *sb, u64 ino, loff_t i_pos) { - struct inode *inode; + if (MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) + return fat_iget(sb, i_pos); - if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO)) - return NULL; + else { + if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO)) + return NULL; + return ilookup(sb, ino); + } +} + +static struct inode *__fat_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation, loff_t i_pos) +{ + struct inode *inode = fat_ilookup(sb, ino, i_pos); - inode = ilookup(sb, ino); if (inode && generation && (inode->i_generation != generation)) { iput(inode); inode = NULL; } + if (inode == NULL && MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) { + struct buffer_head *bh = NULL; + struct msdos_dir_entry *de ; + sector_t blocknr; + int offset; + fat_get_blknr_offset(MSDOS_SB(sb), i_pos, &blocknr, &offset); + bh = sb_bread(sb, blocknr); + if (!bh) { + fat_msg(sb, KERN_ERR, + "unable to read block(%llu) for building NFS inode", + (llu)blocknr); + return inode; + } + de = (struct msdos_dir_entry *)bh->b_data; + /* If a file is deleted on server and client is not updated + * yet, we must not build the inode upon a lookup call. 
+ */ + if (IS_FREE(de[offset].name)) + inode = NULL; + else + inode = fat_build_inode(sb, &de[offset], i_pos); + brelse(bh); + } return inode; } +static struct inode *fat_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + + return __fat_nfs_get_inode(sb, ino, generation, 0); +} + +static int +fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp, + struct inode *parent) +{ + int len = *lenp; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct fat_fid *fid = (struct fat_fid *) fh; + loff_t i_pos; + int type = FILEID_FAT_WITHOUT_PARENT; + + if (parent) { + if (len < FAT_FID_SIZE_WITH_PARENT) { + *lenp = FAT_FID_SIZE_WITH_PARENT; + return FILEID_INVALID; + } + } else { + if (len < FAT_FID_SIZE_WITHOUT_PARENT) { + *lenp = FAT_FID_SIZE_WITHOUT_PARENT; + return FILEID_INVALID; + } + } + + i_pos = fat_i_pos_read(sbi, inode); + *lenp = FAT_FID_SIZE_WITHOUT_PARENT; + fid->i_gen = inode->i_generation; + fid->i_pos_low = i_pos & 0xFFFFFFFF; + fid->i_pos_hi = (i_pos >> 32) & 0xFFFF; + if (parent) { + i_pos = fat_i_pos_read(sbi, parent); + fid->parent_i_pos_hi = (i_pos >> 32) & 0xFFFF; + fid->parent_i_pos_low = i_pos & 0xFFFFFFFF; + fid->parent_i_gen = parent->i_generation; + type = FILEID_FAT_WITH_PARENT; + *lenp = FAT_FID_SIZE_WITH_PARENT; + } + + return type; +} + /** * Map a NFS file handle to a corresponding dentry. * The dentry may or may not be connected to the filesystem root. 
*/ -struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, +static struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, fat_nfs_get_inode); } +static struct dentry *fat_fh_to_dentry_nostale(struct super_block *sb, + struct fid *fh, int fh_len, + int fh_type) +{ + struct inode *inode = NULL; + struct fat_fid *fid = (struct fat_fid *)fh; + loff_t i_pos; + + switch (fh_type) { + case FILEID_FAT_WITHOUT_PARENT: + if (fh_len < FAT_FID_SIZE_WITHOUT_PARENT) + return NULL; + break; + case FILEID_FAT_WITH_PARENT: + if (fh_len < FAT_FID_SIZE_WITH_PARENT) + return NULL; + break; + default: + return NULL; + } + i_pos = fid->i_pos_hi; + i_pos = (i_pos << 32) | (fid->i_pos_low); + inode = __fat_nfs_get_inode(sb, 0, fid->i_gen, i_pos); + + return d_obtain_alias(inode); +} + /* * Find the parent for a file specified by NFS handle. * This requires that the handle contain the i_ino of the parent. 
*/ -struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, +static struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, fat_nfs_get_inode); } +static struct dentry *fat_fh_to_parent_nostale(struct super_block *sb, + struct fid *fh, int fh_len, + int fh_type) +{ + struct inode *inode = NULL; + struct fat_fid *fid = (struct fat_fid *)fh; + loff_t i_pos; + + if (fh_len < FAT_FID_SIZE_WITH_PARENT) + return NULL; + + switch (fh_type) { + case FILEID_FAT_WITH_PARENT: + i_pos = fid->parent_i_pos_hi; + i_pos = (i_pos << 32) | (fid->parent_i_pos_low); + inode = __fat_nfs_get_inode(sb, 0, fid->parent_i_gen, i_pos); + break; + } + + return d_obtain_alias(inode); +} + +/* + * Rebuild the parent for a directory that is not connected + * to the filesystem root + */ +static +struct inode *fat_rebuild_parent(struct super_block *sb, int parent_logstart) +{ + int search_clus, clus_to_match; + struct msdos_dir_entry *de; + struct inode *parent = NULL; + struct inode *dummy_grand_parent = NULL; + struct fat_slot_info sinfo; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + sector_t blknr = fat_clus_to_blknr(sbi, parent_logstart); + struct buffer_head *parent_bh = sb_bread(sb, blknr); + if (!parent_bh) { + fat_msg(sb, KERN_ERR, + "unable to read cluster of parent directory"); + return NULL; + } + + de = (struct msdos_dir_entry *) parent_bh->b_data; + clus_to_match = fat_get_start(sbi, &de[0]); + search_clus = fat_get_start(sbi, &de[1]); + + dummy_grand_parent = fat_dget(sb, search_clus); + if (!dummy_grand_parent) { + dummy_grand_parent = new_inode(sb); + if (!dummy_grand_parent) { + brelse(parent_bh); + return parent; + } + + dummy_grand_parent->i_ino = iunique(sb, MSDOS_ROOT_INO); + fat_fill_inode(dummy_grand_parent, &de[1]); + MSDOS_I(dummy_grand_parent)->i_pos = -1; + } + + if (!fat_scan_logstart(dummy_grand_parent, clus_to_match, &sinfo)) + parent = 
fat_build_inode(sb, sinfo.de, sinfo.i_pos); + + brelse(parent_bh); + iput(dummy_grand_parent); + + return parent; +} + /* * Find the parent for a directory that is not currently connected to * the filesystem root. * * On entry, the caller holds child_dir->d_inode->i_mutex. */ -struct dentry *fat_get_parent(struct dentry *child_dir) +static struct dentry *fat_get_parent(struct dentry *child_dir) { struct super_block *sb = child_dir->d_sb; struct buffer_head *bh = NULL; struct msdos_dir_entry *de; struct inode *parent_inode = NULL; + struct msdos_sb_info *sbi = MSDOS_SB(sb); if (!fat_get_dotdot_entry(child_dir->d_inode, &bh, &de)) { - int parent_logstart = fat_get_start(MSDOS_SB(sb), de); + int parent_logstart = fat_get_start(sbi, de); parent_inode = fat_dget(sb, parent_logstart); + if (!parent_inode && sbi->options.nfs == FAT_NFS_NOSTALE_RO) + parent_inode = fat_rebuild_parent(sb, parent_logstart); } brelse(bh); return d_obtain_alias(parent_inode); } + +const struct export_operations fat_export_ops = { + .fh_to_dentry = fat_fh_to_dentry, + .fh_to_parent = fat_fh_to_parent, + .get_parent = fat_get_parent, +}; + +const struct export_operations fat_export_ops_nostale = { + .encode_fh = fat_encode_fh_nostale, + .fh_to_dentry = fat_fh_to_dentry_nostale, + .fh_to_parent = fat_fh_to_parent_nostale, + .get_parent = fat_get_parent, +}; diff --git a/fs/fifo.c b/fs/fifo.c deleted file mode 100644 index cf6f4345ceb0..000000000000 --- a/fs/fifo.c +++ /dev/null @@ -1,153 +0,0 @@ -/* - * linux/fs/fifo.c - * - * written by Paul H. Hargrove - * - * Fixes: - * 10-06-1999, AV: fixed OOM handling in fifo_open(), moved - * initialization there, switched to external - * allocation of pipe_inode_info. 
- */ - -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/sched.h> -#include <linux/pipe_fs_i.h> - -static int wait_for_partner(struct inode* inode, unsigned int *cnt) -{ - int cur = *cnt; - - while (cur == *cnt) { - pipe_wait(inode->i_pipe); - if (signal_pending(current)) - break; - } - return cur == *cnt ? -ERESTARTSYS : 0; -} - -static void wake_up_partner(struct inode* inode) -{ - wake_up_interruptible(&inode->i_pipe->wait); -} - -static int fifo_open(struct inode *inode, struct file *filp) -{ - struct pipe_inode_info *pipe; - int ret; - - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; - if (!pipe) { - ret = -ENOMEM; - pipe = alloc_pipe_info(inode); - if (!pipe) - goto err_nocleanup; - inode->i_pipe = pipe; - } - filp->f_version = 0; - - /* We can only do regular read/write on fifos */ - filp->f_mode &= (FMODE_READ | FMODE_WRITE); - - switch (filp->f_mode) { - case FMODE_READ: - /* - * O_RDONLY - * POSIX.1 says that O_NONBLOCK means return with the FIFO - * opened, even when there is no process writing the FIFO. - */ - filp->f_op = &read_pipefifo_fops; - pipe->r_counter++; - if (pipe->readers++ == 0) - wake_up_partner(inode); - - if (!pipe->writers) { - if ((filp->f_flags & O_NONBLOCK)) { - /* suppress POLLHUP until we have - * seen a writer */ - filp->f_version = pipe->w_counter; - } else { - if (wait_for_partner(inode, &pipe->w_counter)) - goto err_rd; - } - } - break; - - case FMODE_WRITE: - /* - * O_WRONLY - * POSIX.1 says that O_NONBLOCK means return -1 with - * errno=ENXIO when there is no process reading the FIFO. - */ - ret = -ENXIO; - if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) - goto err; - - filp->f_op = &write_pipefifo_fops; - pipe->w_counter++; - if (!pipe->writers++) - wake_up_partner(inode); - - if (!pipe->readers) { - if (wait_for_partner(inode, &pipe->r_counter)) - goto err_wr; - } - break; - - case FMODE_READ | FMODE_WRITE: - /* - * O_RDWR - * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. 
- * This implementation will NEVER block on a O_RDWR open, since - * the process can at least talk to itself. - */ - filp->f_op = &rdwr_pipefifo_fops; - - pipe->readers++; - pipe->writers++; - pipe->r_counter++; - pipe->w_counter++; - if (pipe->readers == 1 || pipe->writers == 1) - wake_up_partner(inode); - break; - - default: - ret = -EINVAL; - goto err; - } - - /* Ok! */ - mutex_unlock(&inode->i_mutex); - return 0; - -err_rd: - if (!--pipe->readers) - wake_up_interruptible(&pipe->wait); - ret = -ERESTARTSYS; - goto err; - -err_wr: - if (!--pipe->writers) - wake_up_interruptible(&pipe->wait); - ret = -ERESTARTSYS; - goto err; - -err: - if (!pipe->readers && !pipe->writers) - free_pipe_info(inode); - -err_nocleanup: - mutex_unlock(&inode->i_mutex); - return ret; -} - -/* - * Dummy default file-operations: the only thing this does - * is contain the open that then fills in the correct operations - * depending on the access mode of the file... - */ -const struct file_operations def_fifo_fops = { - .open = fifo_open, /* will set read_ or write_pipefifo_fops */ - .llseek = noop_llseek, -}; diff --git a/fs/file.c b/fs/file.c index 3906d9577a18..4a78f981557a 100644 --- a/fs/file.c +++ b/fs/file.c @@ -23,24 +23,10 @@ #include <linux/rcupdate.h> #include <linux/workqueue.h> -struct fdtable_defer { - spinlock_t lock; - struct work_struct wq; - struct fdtable *next; -}; - int sysctl_nr_open __read_mostly = 1024*1024; int sysctl_nr_open_min = BITS_PER_LONG; int sysctl_nr_open_max = 1024 * 1024; /* raised later */ -/* - * We use this list to defer free fdtables that have vmalloced - * sets/arrays. By keeping a per-cpu list, we avoid having to embed - * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in - * this per-task structure. 
- */ -static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); - static void *alloc_fdmem(size_t size) { /* @@ -67,46 +53,9 @@ static void __free_fdtable(struct fdtable *fdt) kfree(fdt); } -static void free_fdtable_work(struct work_struct *work) -{ - struct fdtable_defer *f = - container_of(work, struct fdtable_defer, wq); - struct fdtable *fdt; - - spin_lock_bh(&f->lock); - fdt = f->next; - f->next = NULL; - spin_unlock_bh(&f->lock); - while(fdt) { - struct fdtable *next = fdt->next; - - __free_fdtable(fdt); - fdt = next; - } -} - static void free_fdtable_rcu(struct rcu_head *rcu) { - struct fdtable *fdt = container_of(rcu, struct fdtable, rcu); - struct fdtable_defer *fddef; - - BUG_ON(!fdt); - BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT); - - if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) { - kfree(fdt->fd); - kfree(fdt->open_fds); - kfree(fdt); - } else { - fddef = &get_cpu_var(fdtable_defer_list); - spin_lock(&fddef->lock); - fdt->next = fddef->next; - fddef->next = fdt; - /* vmallocs are handled from the workqueue context */ - schedule_work(&fddef->wq); - spin_unlock(&fddef->lock); - put_cpu_var(fdtable_defer_list); - } + __free_fdtable(container_of(rcu, struct fdtable, rcu)); } /* @@ -174,7 +123,6 @@ static struct fdtable * alloc_fdtable(unsigned int nr) fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; - fdt->next = NULL; return fdt; @@ -221,7 +169,7 @@ static int expand_fdtable(struct files_struct *files, int nr) /* Continue as planned */ copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); - if (cur_fdt->max_fds > NR_OPEN_DEFAULT) + if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); } else { /* Somebody else expanded, so undo our attempt */ @@ -316,7 +264,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->fd = &newf->fd_array[0]; - new_fdt->next 
= NULL; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); @@ -490,19 +437,8 @@ void exit_files(struct task_struct *tsk) } } -static void fdtable_defer_list_init(int cpu) -{ - struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); - spin_lock_init(&fddef->lock); - INIT_WORK(&fddef->wq, free_fdtable_work); - fddef->next = NULL; -} - void __init files_defer_init(void) { - int i; - for_each_possible_cpu(i) - fdtable_defer_list_init(i); sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; } diff --git a/fs/filesystems.c b/fs/filesystems.c index da165f6adcbf..92567d95ba6a 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -273,7 +273,7 @@ struct file_system_type *get_fs_type(const char *name) int len = dot ? dot - name : strlen(name); fs = __get_fs_type(name, len); - if (!fs && (request_module("%.*s", len, name) == 0)) + if (!fs && (request_module("fs-%.*s", len, name) == 0)) fs = __get_fs_type(name, len); if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index fed2c8afb3a9..e37eb274e492 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -52,7 +52,6 @@ MODULE_AUTHOR("Christoph Hellwig"); MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */ static void vxfs_put_super(struct super_block *); @@ -258,6 +257,8 @@ static struct file_system_type vxfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("vxfs"); /* makes mount -t vxfs autoload the module */ +MODULE_ALIAS("vxfs"); static int __init vxfs_init(void) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 21f46fb3a101..3be57189efd5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -22,7 +22,6 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/kthread.h> -#include <linux/freezer.h> #include 
<linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> @@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head) #define CREATE_TRACE_POINTS #include <trace/events/writeback.h> -/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ -static void bdi_wakeup_flusher(struct backing_dev_info *bdi) -{ - if (bdi->wb.task) { - wake_up_process(bdi->wb.task); - } else { - /* - * The bdi thread isn't there, wake up the forker thread which - * will create and run it. - */ - wake_up_process(default_backing_dev_info.wb.task); - } -} - static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { @@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, spin_lock_bh(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); - if (!bdi->wb.task) - trace_writeback_nothread(bdi, work); - bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); + + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } static void @@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) { - trace_writeback_nowork(bdi); - wake_up_process(bdi->wb.task); - } + trace_writeback_nowork(bdi); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); return; } @@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(bdi); - spin_lock_bh(&bdi->wb_lock); - bdi_wakeup_flusher(bdi); - spin_unlock_bh(&bdi->wb_lock); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } /* @@ -1020,66 +1000,49 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) /* * Handle writeback of dirty data for the device backed by this bdi. Also - * wakes up periodically and does kupdated style flushing. + * reschedules periodically and does kupdated style flushing. 
*/ -int bdi_writeback_thread(void *data) +void bdi_writeback_workfn(struct work_struct *work) { - struct bdi_writeback *wb = data; + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, dwork); struct backing_dev_info *bdi = wb->bdi; long pages_written; + set_worker_desc("flush-%s", dev_name(bdi->dev)); current->flags |= PF_SWAPWRITE; - set_freezable(); - wb->last_active = jiffies; - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - trace_writeback_thread_start(bdi); - while (!kthread_freezable_should_stop(NULL)) { + if (likely(!current_is_workqueue_rescuer() || + list_empty(&bdi->bdi_list))) { /* - * Remove own delayed wake-up timer, since we are already awake - * and we'll take care of the periodic write-back. + * The normal path. Keep writing back @bdi until its + * work_list is empty. Note that this path is also taken + * if @bdi is shutting down even when we're running off the + * rescuer as work_list needs to be drained. */ - del_timer(&wb->wakeup_timer); - - pages_written = wb_do_writeback(wb, 0); - + do { + pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + } while (!list_empty(&bdi->work_list)); + } else { + /* + * bdi_wq can't get enough workers and we're running off + * the emergency worker. Don't hog it. Hopefully, 1024 is + * enough for efficient IO. + */ + pages_written = writeback_inodes_wb(&bdi->wb, 1024, + WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); - - if (pages_written) - wb->last_active = jiffies; - - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&bdi->work_list) || kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - continue; - } - - if (wb_has_dirty_io(wb) && dirty_writeback_interval) - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - else { - /* - * We have nothing to do, so can go sleep without any - * timeout and save power. 
When a work is queued or - * something is made dirty - we will be woken up. - */ - schedule(); - } } - /* Flush any work that raced with us exiting */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); + if (!list_empty(&bdi->work_list) || + (wb_has_dirty_io(wb) && dirty_writeback_interval)) + queue_delayed_work(bdi_wq, &wb->dwork, + msecs_to_jiffies(dirty_writeback_interval * 10)); - trace_writeback_thread_stop(bdi); - return 0; + current->flags &= ~PF_SWAPWRITE; } - /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 8179e8bc4a3d..40d13c70ef51 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -287,5 +287,5 @@ const struct file_operations fscache_stats_fops = { .open = fscache_stats_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = single_release, }; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index b7978b9f75ef..a0b0855d00a9 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -341,6 +341,7 @@ static struct file_system_type fuse_ctl_fs_type = { .mount = fuse_ctl_mount, .kill_sb = fuse_ctl_kill_sb, }; +MODULE_ALIAS_FS("fusectl"); int __init fuse_ctl_init(void) { diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 6f96a8def147..aef34b1e635e 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -38,6 +38,7 @@ #include <linux/device.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/aio.h> #include <linux/kdev_t.h> #include <linux/kthread.h> #include <linux/list.h> @@ -92,8 +93,9 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, { loff_t pos = 0; struct iovec iov = { .iov_base = buf, .iov_len = count }; + struct fuse_io_priv io = { .async = 0, .file = file }; - return fuse_direct_io(file, &iov, 1, count, &pos, 0); + return fuse_direct_io(&io, &iov, 1, count, &pos, 0); } static ssize_t cuse_write(struct file *file, const char __user *buf, @@ -101,12 +103,13 @@ 
static ssize_t cuse_write(struct file *file, const char __user *buf, { loff_t pos = 0; struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + struct fuse_io_priv io = { .async = 0, .file = file }; /* * No locking or generic_write_checks(), the server is * responsible for locking and sanity checks. */ - return fuse_direct_io(file, &iov, 1, count, &pos, 1); + return fuse_direct_io(&io, &iov, 1, count, &pos, 1); } static int cuse_open(struct inode *inode, struct file *file) @@ -422,7 +425,7 @@ static int cuse_send_init(struct cuse_conn *cc) BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); - req = fuse_get_req(fc, 1); + req = fuse_get_req_for_background(fc, 1); if (IS_ERR(req)) { rc = PTR_ERR(req); goto err; @@ -504,7 +507,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) cc->fc.release = cuse_fc_release; cc->fc.connected = 1; - cc->fc.blocked = 0; + cc->fc.initialized = 1; rc = cuse_send_init(cc); if (rc) { fuse_conn_put(&cc->fc); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 11dfa0c3fb46..1d55f9465400 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -19,6 +19,7 @@ #include <linux/pipe_fs_i.h> #include <linux/swap.h> #include <linux/splice.h> +#include <linux/aio.h> MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); @@ -111,7 +112,7 @@ static void restore_sigs(sigset_t *oldset) sigprocmask(SIG_SETMASK, oldset, NULL); } -static void __fuse_get_request(struct fuse_req *req) +void __fuse_get_request(struct fuse_req *req) { atomic_inc(&req->count); } @@ -130,20 +131,30 @@ static void fuse_req_init_context(struct fuse_req *req) req->in.h.pid = current->pid; } -struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) +static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) +{ + return !fc->initialized || (for_background && fc->blocked); +} + +static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, + bool for_background) { struct fuse_req *req; - sigset_t oldset; - int 
intr; int err; - atomic_inc(&fc->num_waiting); - block_sigs(&oldset); - intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); - restore_sigs(&oldset); - err = -EINTR; - if (intr) - goto out; + + if (fuse_block_alloc(fc, for_background)) { + sigset_t oldset; + int intr; + + block_sigs(&oldset); + intr = wait_event_interruptible_exclusive(fc->blocked_waitq, + !fuse_block_alloc(fc, for_background)); + restore_sigs(&oldset); + err = -EINTR; + if (intr) + goto out; + } err = -ENOTCONN; if (!fc->connected) @@ -151,19 +162,35 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) req = fuse_request_alloc(npages); err = -ENOMEM; - if (!req) + if (!req) { + if (for_background) + wake_up(&fc->blocked_waitq); goto out; + } fuse_req_init_context(req); req->waiting = 1; + req->background = for_background; return req; out: atomic_dec(&fc->num_waiting); return ERR_PTR(err); } + +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) +{ + return __fuse_get_req(fc, npages, false); +} EXPORT_SYMBOL_GPL(fuse_get_req); +struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, + unsigned npages) +{ + return __fuse_get_req(fc, npages, true); +} +EXPORT_SYMBOL_GPL(fuse_get_req_for_background); + /* * Return request in fuse_file->reserved_req. However that may * currently be in use. 
If that is the case, wait for it to become @@ -225,19 +252,31 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, struct fuse_req *req; atomic_inc(&fc->num_waiting); - wait_event(fc->blocked_waitq, !fc->blocked); + wait_event(fc->blocked_waitq, fc->initialized); req = fuse_request_alloc(0); if (!req) req = get_reserved_req(fc, file); fuse_req_init_context(req); req->waiting = 1; + req->background = 0; return req; } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { + if (unlikely(req->background)) { + /* + * We get here in the unlikely case that a background + * request was allocated but not sent + */ + spin_lock(&fc->lock); + if (!fc->blocked) + wake_up(&fc->blocked_waitq); + spin_unlock(&fc->lock); + } + if (req->waiting) atomic_dec(&fc->num_waiting); @@ -335,10 +374,15 @@ __releases(fc->lock) list_del(&req->intr_entry); req->state = FUSE_REQ_FINISHED; if (req->background) { - if (fc->num_background == fc->max_background) { + req->background = 0; + + if (fc->num_background == fc->max_background) fc->blocked = 0; - wake_up_all(&fc->blocked_waitq); - } + + /* Wake up next waiter, if any */ + if (!fc->blocked && waitqueue_active(&fc->blocked_waitq)) + wake_up(&fc->blocked_waitq); + if (fc->num_background == fc->congestion_threshold && fc->connected && fc->bdi_initialized) { clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); @@ -442,6 +486,7 @@ __acquires(fc->lock) static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { + BUG_ON(req->background); spin_lock(&fc->lock); if (!fc->connected) req->out.h.error = -ENOTCONN; @@ -469,7 +514,7 @@ EXPORT_SYMBOL_GPL(fuse_request_send); static void fuse_request_send_nowait_locked(struct fuse_conn *fc, struct fuse_req *req) { - req->background = 1; + BUG_ON(!req->background); fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; @@ -1319,7 +1364,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t 
*ppos, page_nr++; ret += buf->len; - if (pipe->inode) + if (pipe->files) do_wakeup = 1; } @@ -2071,6 +2116,7 @@ void fuse_abort_conn(struct fuse_conn *fc) if (fc->connected) { fc->connected = 0; fc->blocked = 0; + fc->initialized = 1; end_io_requests(fc); end_queued_requests(fc); end_polls(fc); @@ -2089,6 +2135,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) spin_lock(&fc->lock); fc->connected = 0; fc->blocked = 0; + fc->initialized = 1; end_queued_requests(fc); end_polls(fc); wake_up_all(&fc->blocked_waitq); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ff15522481d4..254df56b847b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1562,10 +1562,9 @@ void fuse_release_nowrite(struct inode *inode) * vmtruncate() doesn't allow for this case, so do the rlimit checking * and the actual truncation by hand. */ -static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, - struct file *file) +int fuse_do_setattr(struct inode *inode, struct iattr *attr, + struct file *file) { - struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; struct fuse_setattr_in inarg; @@ -1574,9 +1573,6 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, loff_t oldsize; int err; - if (!fuse_allow_current_process(fc)) - return -EACCES; - if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) attr->ia_valid |= ATTR_FORCE; @@ -1671,10 +1667,15 @@ error: static int fuse_setattr(struct dentry *entry, struct iattr *attr) { + struct inode *inode = entry->d_inode; + + if (!fuse_allow_current_process(get_fuse_conn(inode))) + return -EACCES; + if (attr->ia_valid & ATTR_FILE) - return fuse_do_setattr(entry, attr, attr->ia_file); + return fuse_do_setattr(inode, attr, attr->ia_file); else - return fuse_do_setattr(entry, attr, NULL); + return fuse_do_setattr(inode, attr, NULL); } static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 34b80ba95bad..d1c9b85b3f58 
100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/compat.h> #include <linux/swap.h> +#include <linux/aio.h> static const struct file_operations fuse_direct_io_file_operations; @@ -126,11 +127,13 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) struct fuse_req *req = ff->reserved_req; if (sync) { + req->background = 0; fuse_request_send(ff->fc, req); path_put(&req->misc.release.path); fuse_put_request(ff->fc, req); } else { req->end = fuse_release_end; + req->background = 1; fuse_request_send_background(ff->fc, req); } kfree(ff); @@ -282,6 +285,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags) WARN_ON(atomic_read(&ff->count) > 1); fuse_prepare_release(ff, flags, FUSE_RELEASE); ff->reserved_req->force = 1; + ff->reserved_req->background = 0; fuse_request_send(ff->fc, ff->reserved_req); fuse_put_request(ff->fc, ff->reserved_req); kfree(ff); @@ -491,9 +495,115 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, req->out.args[0].size = count; } -static size_t fuse_send_read(struct fuse_req *req, struct file *file, +static void fuse_release_user_pages(struct fuse_req *req, int write) +{ + unsigned i; + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + if (write) + set_page_dirty_lock(page); + put_page(page); + } +} + +/** + * In case of short read, the caller sets 'pos' to the position of + * actual end of fuse request in IO request. Otherwise, if bytes_requested + * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. + * + * An example: + * User requested DIO read of 64K. It was splitted into two 32K fuse requests, + * both submitted asynchronously. The first of them was ACKed by userspace as + * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The + * second request was ACKed as short, e.g. only 1K was read, resulting in + * pos == 33K. 
+ * + * Thus, when all fuse requests are completed, the minimal non-negative 'pos' + * will be equal to the length of the longest contiguous fragment of + * transferred data starting from the beginning of IO request. + */ +static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) +{ + int left; + + spin_lock(&io->lock); + if (err) + io->err = io->err ? : err; + else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes)) + io->bytes = pos; + + left = --io->reqs; + spin_unlock(&io->lock); + + if (!left) { + long res; + + if (io->err) + res = io->err; + else if (io->bytes >= 0 && io->write) + res = -EIO; + else { + res = io->bytes < 0 ? io->size : io->bytes; + + if (!is_sync_kiocb(io->iocb)) { + struct path *path = &io->iocb->ki_filp->f_path; + struct inode *inode = path->dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + spin_unlock(&fc->lock); + } + } + + aio_complete(io->iocb, res, 0); + kfree(io); + } +} + +static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_io_priv *io = req->io; + ssize_t pos = -1; + + fuse_release_user_pages(req, !io->write); + + if (io->write) { + if (req->misc.write.in.size != req->misc.write.out.size) + pos = req->misc.write.in.offset - io->offset + + req->misc.write.out.size; + } else { + if (req->misc.read.in.size != req->out.args[0].size) + pos = req->misc.read.in.offset - io->offset + + req->out.args[0].size; + } + + fuse_aio_complete(io, req->out.h.error, pos); +} + +static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, + size_t num_bytes, struct fuse_io_priv *io) +{ + spin_lock(&io->lock); + io->size += num_bytes; + io->reqs++; + spin_unlock(&io->lock); + + req->io = io; + req->end = fuse_aio_complete_req; + + __fuse_get_request(req); + fuse_request_send_background(fc, req); + + return num_bytes; +} + +static size_t 
fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io, loff_t pos, size_t count, fl_owner_t owner) { + struct file *file = io->file; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -504,6 +614,10 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file, inarg->read_flags |= FUSE_READ_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); } + + if (io->async) + return fuse_async_req_send(fc, req, count, io); + fuse_request_send(fc, req); return req->out.args[0].size; } @@ -524,6 +638,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, static int fuse_readpage(struct file *file, struct page *page) { + struct fuse_io_priv io = { .async = 0, .file = file }; struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; @@ -556,7 +671,7 @@ static int fuse_readpage(struct file *file, struct page *page) req->num_pages = 1; req->pages[0] = page; req->page_descs[0].length = count; - num_read = fuse_send_read(req, file, pos, count, NULL); + num_read = fuse_send_read(req, &io, pos, count, NULL); err = req->out.h.error; fuse_put_request(fc, req); @@ -661,7 +776,12 @@ static int fuse_readpages_fill(void *_data, struct page *page) int nr_alloc = min_t(unsigned, data->nr_pages, FUSE_MAX_PAGES_PER_REQ); fuse_send_readpages(req, data->file); - data->req = req = fuse_get_req(fc, nr_alloc); + if (fc->async_read) + req = fuse_get_req_for_background(fc, nr_alloc); + else + req = fuse_get_req(fc, nr_alloc); + + data->req = req; if (IS_ERR(req)) { unlock_page(page); return PTR_ERR(req); @@ -696,7 +816,10 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - data.req = fuse_get_req(fc, nr_alloc); + if (fc->async_read) + data.req = fuse_get_req_for_background(fc, nr_alloc); + else + data.req = fuse_get_req(fc, nr_alloc); data.nr_pages = nr_pages; err = PTR_ERR(data.req); if (IS_ERR(data.req)) @@ -758,9 
+881,10 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, req->out.args[0].value = outarg; } -static size_t fuse_send_write(struct fuse_req *req, struct file *file, +static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, loff_t pos, size_t count, fl_owner_t owner) { + struct file *file = io->file; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; struct fuse_write_in *inarg = &req->misc.write.in; @@ -771,6 +895,10 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file, inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); } + + if (io->async) + return fuse_async_req_send(fc, req, count, io); + fuse_request_send(fc, req); return req->misc.write.out.size; } @@ -794,11 +922,12 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, size_t res; unsigned offset; unsigned i; + struct fuse_io_priv io = { .async = 0, .file = file }; for (i = 0; i < req->num_pages; i++) fuse_wait_on_page_writeback(inode, req->pages[i]->index); - res = fuse_send_write(req, file, pos, count, NULL); + res = fuse_send_write(req, &io, pos, count, NULL); offset = req->page_descs[0].offset; count = res; @@ -971,7 +1100,6 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return err; count = ocount; - sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ @@ -1030,23 +1158,10 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, out: current->backing_dev_info = NULL; mutex_unlock(&inode->i_mutex); - sb_end_write(inode->i_sb); return written ? 
written : err; } -static void fuse_release_user_pages(struct fuse_req *req, int write) -{ - unsigned i; - - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; - if (write) - set_page_dirty_lock(page); - put_page(page); - } -} - static inline void fuse_page_descs_length_init(struct fuse_req *req, unsigned index, unsigned nr_pages) { @@ -1148,10 +1263,11 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p) return min(npages, FUSE_MAX_PAGES_PER_REQ); } -ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, +ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, unsigned long nr_segs, size_t count, loff_t *ppos, int write) { + struct file *file = io->file; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; size_t nmax = write ? fc->max_write : fc->max_read; @@ -1177,11 +1293,12 @@ ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, } if (write) - nres = fuse_send_write(req, file, pos, nbytes, owner); + nres = fuse_send_write(req, io, pos, nbytes, owner); else - nres = fuse_send_read(req, file, pos, nbytes, owner); + nres = fuse_send_read(req, io, pos, nbytes, owner); - fuse_release_user_pages(req, !write); + if (!io->async) + fuse_release_user_pages(req, !write); if (req->out.h.error) { if (!res) res = req->out.h.error; @@ -1211,17 +1328,19 @@ ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, } EXPORT_SYMBOL_GPL(fuse_direct_io); -static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +static ssize_t __fuse_direct_read(struct fuse_io_priv *io, + const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos, + size_t count) { ssize_t res; + struct file *file = io->file; struct inode *inode = file_inode(file); if (is_bad_inode(inode)) return -EIO; - res = fuse_direct_io(file, iov, nr_segs, iov_length(iov, nr_segs), - ppos, 0); + res = fuse_direct_io(io, iov, nr_segs, count, ppos, 0); 
fuse_invalidate_attr(inode); @@ -1231,23 +1350,23 @@ static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov, static ssize_t fuse_direct_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + struct fuse_io_priv io = { .async = 0, .file = file }; struct iovec iov = { .iov_base = buf, .iov_len = count }; - return __fuse_direct_read(file, &iov, 1, ppos); + return __fuse_direct_read(&io, &iov, 1, ppos, count); } -static ssize_t __fuse_direct_write(struct file *file, const struct iovec *iov, +static ssize_t __fuse_direct_write(struct fuse_io_priv *io, + const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) { + struct file *file = io->file; struct inode *inode = file_inode(file); size_t count = iov_length(iov, nr_segs); ssize_t res; res = generic_write_checks(file, ppos, &count, 0); - if (!res) { - res = fuse_direct_io(file, iov, nr_segs, count, ppos, 1); - if (res > 0) - fuse_write_update_size(inode, *ppos); - } + if (!res) + res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1); fuse_invalidate_attr(inode); @@ -1260,13 +1379,16 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; struct inode *inode = file_inode(file); ssize_t res; + struct fuse_io_priv io = { .async = 0, .file = file }; if (is_bad_inode(inode)) return -EIO; /* Don't allow parallel writes to the same file */ mutex_lock(&inode->i_mutex); - res = __fuse_direct_write(file, &iov, 1, ppos); + res = __fuse_direct_write(&io, &iov, 1, ppos); + if (res > 0) + fuse_write_update_size(inode, *ppos); mutex_unlock(&inode->i_mutex); return res; @@ -1375,6 +1497,7 @@ static int fuse_writepage_locked(struct page *page) if (!req) goto err; + req->background = 1; /* writeback always goes to bg_queue */ tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) goto err_free; @@ -2228,21 +2351,93 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc, return 0; } +static 
void fuse_do_truncate(struct file *file) +{ + struct inode *inode = file->f_mapping->host; + struct iattr attr; + + attr.ia_valid = ATTR_SIZE; + attr.ia_size = i_size_read(inode); + + attr.ia_file = file; + attr.ia_valid |= ATTR_FILE; + + fuse_do_setattr(inode, &attr, file); +} + static ssize_t fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { ssize_t ret = 0; - struct file *file = NULL; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; loff_t pos = 0; + struct inode *inode; + loff_t i_size; + size_t count = iov_length(iov, nr_segs); + struct fuse_io_priv *io; - file = iocb->ki_filp; pos = offset; + inode = file->f_mapping->host; + i_size = i_size_read(inode); + + /* optimization for short read */ + if (rw != WRITE && offset + count > i_size) { + if (offset >= i_size) + return 0; + count = i_size - offset; + } + + io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); + if (!io) + return -ENOMEM; + spin_lock_init(&io->lock); + io->reqs = 1; + io->bytes = -1; + io->size = 0; + io->offset = offset; + io->write = (rw == WRITE); + io->err = 0; + io->file = file; + /* + * By default, we want to optimize all I/Os with async request + * submission to the client filesystem if supported. + */ + io->async = ff->fc->async_dio; + io->iocb = iocb; + + /* + * We cannot asynchronously extend the size of a file. We have no method + * to wait on real async I/O requests, so we must submit this request + * synchronously. + */ + if (!is_sync_kiocb(iocb) && (offset + count > i_size)) + io->async = false; if (rw == WRITE) - ret = __fuse_direct_write(file, iov, nr_segs, &pos); + ret = __fuse_direct_write(io, iov, nr_segs, &pos); else - ret = __fuse_direct_read(file, iov, nr_segs, &pos); + ret = __fuse_direct_read(io, iov, nr_segs, &pos, count); + + if (io->async) { + fuse_aio_complete(io, ret < 0 ? 
ret : 0, -1); + + /* we have a non-extending, async request, so return */ + if (ret > 0 && !is_sync_kiocb(iocb)) + return -EIOCBQUEUED; + + ret = wait_on_sync_kiocb(iocb); + } else { + kfree(io); + } + + if (rw == WRITE) { + if (ret > 0) + fuse_write_update_size(inode, pos); + else if (ret < 0 && offset + count > i_size) + fuse_do_truncate(file); + } return ret; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 6aeba864f070..fde7249a3a96 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -228,6 +228,20 @@ enum fuse_req_state { FUSE_REQ_FINISHED }; +/** The request IO state (for asynchronous processing) */ +struct fuse_io_priv { + int async; + spinlock_t lock; + unsigned reqs; + ssize_t bytes; + size_t size; + __u64 offset; + bool write; + int err; + struct kiocb *iocb; + struct file *file; +}; + /** * A request to the client */ @@ -332,6 +346,9 @@ struct fuse_req { /** Inode used in the request or NULL */ struct inode *inode; + /** AIO control block */ + struct fuse_io_priv *io; + /** Link on fi->writepages */ struct list_head writepages_entry; @@ -417,6 +434,10 @@ struct fuse_conn { /** Batching of FORGET requests (positive indicates FORGET batch) */ int forget_batch; + /** Flag indicating that INIT reply has been received. Allocating + * any fuse request will be suspended until the flag is set */ + int initialized; + /** Flag indicating if connection is blocked. This will be the case before the INIT reply is received, and if there are too many outstading backgrounds requests */ @@ -520,6 +541,9 @@ struct fuse_conn { /** Does the filesystem want adaptive readdirplus? */ unsigned readdirplus_auto:1; + /** Does the filesystem support asynchronous direct-IO submission? 
*/ + unsigned async_dio:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -708,6 +732,13 @@ void fuse_request_free(struct fuse_req *req); * caller should specify # elements in req->pages[] explicitly */ struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); +struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, + unsigned npages); + +/* + * Increment reference count on request + */ +void __fuse_get_request(struct fuse_req *req); /** * Get a request, may fail with -ENOMEM, @@ -823,7 +854,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir); -ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, +ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, unsigned long nr_segs, size_t count, loff_t *ppos, int write); long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, @@ -835,4 +866,7 @@ int fuse_dev_release(struct inode *inode, struct file *file); void fuse_write_update_size(struct inode *inode, loff_t pos); +int fuse_do_setattr(struct inode *inode, struct iattr *attr, + struct file *file); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index df00993ed108..6201f81e4d3a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -346,6 +346,7 @@ static void fuse_send_destroy(struct fuse_conn *fc) fc->destroy_req = NULL; req->in.h.opcode = FUSE_DESTROY; req->force = 1; + req->background = 0; fuse_request_send(fc, req); fuse_put_request(fc, req); } @@ -362,6 +363,7 @@ void fuse_conn_kill(struct fuse_conn *fc) spin_lock(&fc->lock); fc->connected = 0; fc->blocked = 0; + fc->initialized = 1; spin_unlock(&fc->lock); /* Flush all readers on this fs */ kill_fasync(&fc->fasync, SIGIO, POLL_IN); @@ -581,7 +583,8 @@ void fuse_conn_init(struct fuse_conn *fc) fc->khctr = 0; fc->polled_files = RB_ROOT; fc->reqctr = 0; - fc->blocked = 1; + 
fc->blocked = 0; + fc->initialized = 0; fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); } @@ -868,6 +871,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->do_readdirplus = 1; if (arg->flags & FUSE_READDIRPLUS_AUTO) fc->readdirplus_auto = 1; + if (arg->flags & FUSE_ASYNC_DIO) + fc->async_dio = 1; } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -880,7 +885,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } - fc->blocked = 0; + fc->initialized = 1; wake_up_all(&fc->blocked_waitq); } @@ -895,7 +900,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | - FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO; + FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1043,6 +1048,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) init_req = fuse_request_alloc(0); if (!init_req) goto err_put_root; + init_req->background = 1; if (is_bdev) { fc->destroy_req = fuse_request_alloc(0); @@ -1117,6 +1123,7 @@ static struct file_system_type fuse_fs_type = { .mount = fuse_mount, .kill_sb = fuse_kill_sb_anon, }; +MODULE_ALIAS_FS("fuse"); #ifdef CONFIG_BLOCK static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, @@ -1146,6 +1153,7 @@ static struct file_system_type fuseblk_fs_type = { .kill_sb = fuse_kill_sb_blk, .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, }; +MODULE_ALIAS_FS("fuseblk"); static inline int register_fuseblk(void) { diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 24f414f0ce61..0bad69ed6336 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -20,6 
+20,7 @@ #include <linux/swap.h> #include <linux/gfs2_ondisk.h> #include <linux/backing-dev.h> +#include <linux/aio.h> #include "gfs2.h" #include "incore.h" @@ -1055,7 +1056,7 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) if (atomic_read(&bh->b_count)) goto cannot_release; bd = bh->b_private; - if (bd && bd->bd_ail) + if (bd && bd->bd_tr) goto cannot_release; if (buffer_pinned(bh) || buffer_dirty(bh)) goto not_possible; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5e83657f046e..1dc9a13ce6bb 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -787,7 +787,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, goto out_rlist; if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */ - gfs2_rs_deltree(ip, ip->i_res); + gfs2_rs_deltree(ip->i_res); error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT + RES_STATFS + RES_QUOTA, diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 019f45e45097..acd16764b133 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -25,6 +25,7 @@ #include <asm/uaccess.h> #include <linux/dlm.h> #include <linux/dlm_plock.h> +#include <linux/aio.h> #include "gfs2.h" #include "incore.h" @@ -923,8 +924,11 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) cmd = F_SETLK; fl->fl_type = F_UNLCK; } - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { + if (fl->fl_type == F_UNLCK) + posix_lock_file_wait(file, fl); return -EIO; + } if (IS_GETLK(cmd)) return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); else if (fl->fl_type == F_UNLCK) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index cf3515546739..9435384562a2 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -912,7 +912,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh) */ static void handle_callback(struct gfs2_glock *gl, unsigned int state, - unsigned long delay) + unsigned long delay, bool remote) { int bit = delay ? 
GLF_PENDING_DEMOTE : GLF_DEMOTE; @@ -925,8 +925,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, gl->gl_demote_state = LM_ST_UNLOCKED; } if (gl->gl_ops->go_callback) - gl->gl_ops->go_callback(gl); - trace_gfs2_demote_rq(gl); + gl->gl_ops->go_callback(gl, remote); + trace_gfs2_demote_rq(gl, remote); } void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) @@ -1017,11 +1017,11 @@ do_cancel: return; trap_recursive: - print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip); + printk(KERN_ERR "original: %pSR\n", (void *)gh2->gh_ip); printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid)); printk(KERN_ERR "lock type: %d req lock state : %d\n", gh2->gh_gl->gl_name.ln_type, gh2->gh_state); - print_symbol(KERN_ERR "new: %s\n", gh->gh_ip); + printk(KERN_ERR "new: %pSR\n", (void *)gh->gh_ip); printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); printk(KERN_ERR "lock type: %d req lock state : %d\n", gh->gh_gl->gl_name.ln_type, gh->gh_state); @@ -1091,7 +1091,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) spin_lock(&gl->gl_spin); if (gh->gh_flags & GL_NOCACHE) - handle_callback(gl, LM_ST_UNLOCKED, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0, false); list_del_init(&gh->gh_list); if (find_first_holder(gl) == NULL) { @@ -1279,19 +1279,6 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) gfs2_glock_dq(&ghs[num_gh]); } -/** - * gfs2_glock_dq_uninit_m - release multiple glocks - * @num_gh: the number of structures - * @ghs: an array of struct gfs2_holder structures - * - */ - -void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) -{ - while (num_gh--) - gfs2_glock_dq_uninit(&ghs[num_gh]); -} - void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) { unsigned long delay = 0; @@ -1309,7 +1296,7 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) } spin_lock(&gl->gl_spin); - handle_callback(gl, state, delay); + handle_callback(gl, state, delay, true); spin_unlock(&gl->gl_spin); if 
(queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); @@ -1422,7 +1409,7 @@ __acquires(&lru_lock) spin_unlock(&lru_lock); spin_lock(&gl->gl_spin); if (demote_ok(gl)) - handle_callback(gl, LM_ST_UNLOCKED, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0, false); WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); smp_mb__after_clear_bit(); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) @@ -1547,7 +1534,7 @@ static void clear_glock(struct gfs2_glock *gl) spin_lock(&gl->gl_spin); if (gl->gl_state != LM_ST_UNLOCKED) - handle_callback(gl, LM_ST_UNLOCKED, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0, false); spin_unlock(&gl->gl_spin); gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) @@ -1590,6 +1577,7 @@ static void dump_glock_func(struct gfs2_glock *gl) void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); + flush_workqueue(glock_workqueue); glock_hash_walk(clear_glock, sdp); flush_workqueue(glock_workqueue); wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index fd580b7861d5..69f66e3d22bf 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -201,7 +201,6 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, struct gfs2_holder *gh); extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) extern __printf(2, 3) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 444b6503ebc4..c66e99c97571 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -515,12 +515,12 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl) * * 
gl_spin lock is held while calling this */ -static void iopen_go_callback(struct gfs2_glock *gl) +static void iopen_go_callback(struct gfs2_glock *gl, bool remote) { struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; struct gfs2_sbd *sdp = gl->gl_sbd; - if (sdp->sd_vfs->s_flags & MS_RDONLY) + if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY)) return; if (gl->gl_demote_state == LM_ST_UNLOCKED && diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 156e42ec84ea..26aabd7caba7 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -31,7 +31,6 @@ struct gfs2_holder; struct gfs2_glock; struct gfs2_quota_data; struct gfs2_trans; -struct gfs2_ail; struct gfs2_jdesc; struct gfs2_sbd; struct lm_lockops; @@ -53,7 +52,7 @@ struct gfs2_log_header_host { struct gfs2_log_operations { void (*lo_before_commit) (struct gfs2_sbd *sdp); - void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); + void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); void (*lo_before_scan) (struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass); int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start, @@ -139,7 +138,7 @@ struct gfs2_bufdata { struct list_head bd_list; const struct gfs2_log_operations *bd_ops; - struct gfs2_ail *bd_ail; + struct gfs2_trans *bd_tr; struct list_head bd_ail_st_list; struct list_head bd_ail_gl_list; }; @@ -211,7 +210,7 @@ struct gfs2_glock_operations { int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); - void (*go_callback) (struct gfs2_glock *gl); + void (*go_callback)(struct gfs2_glock *gl, bool remote); const int go_type; const unsigned long go_flags; #define GLOF_ASPACE 1 @@ -433,6 +432,7 @@ struct gfs2_trans { struct gfs2_holder tr_t_gh; int tr_touched; + int tr_attached; unsigned int tr_num_buf_new; unsigned int tr_num_databuf_new; @@ -440,14 +440,12 @@ struct gfs2_trans { unsigned int tr_num_databuf_rm; 
unsigned int tr_num_revoke; unsigned int tr_num_revoke_rm; -}; -struct gfs2_ail { - struct list_head ai_list; + struct list_head tr_list; - unsigned int ai_first; - struct list_head ai_ail1_list; - struct list_head ai_ail2_list; + unsigned int tr_first; + struct list_head tr_ail1_list; + struct list_head tr_ail2_list; }; struct gfs2_journal_extent { @@ -588,6 +586,7 @@ struct lm_lockstruct { struct dlm_lksb ls_control_lksb; /* control_lock */ char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */ struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ + char *ls_lvb_bits; spinlock_t ls_recover_spin; /* protects following fields */ unsigned long ls_recover_flags; /* DFL_ */ @@ -709,6 +708,7 @@ struct gfs2_sbd { spinlock_t sd_log_lock; + struct gfs2_trans *sd_log_tr; unsigned int sd_log_blks_reserved; unsigned int sd_log_commited_buf; unsigned int sd_log_commited_databuf; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index cc00bd1d1f87..8833a4f264e3 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -392,11 +392,15 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags) int error; int dblocks = 1; - error = gfs2_inplace_reserve(ip, RES_DINODE, flags); + error = gfs2_quota_lock_check(ip); if (error) goto out; - error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0); + error = gfs2_inplace_reserve(ip, RES_DINODE, flags); + if (error) + goto out_quota; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipreserv; @@ -409,6 +413,8 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags) out_ipreserv: gfs2_inplace_release(ip); +out_quota: + gfs2_quota_unlock(ip); out: return error; } @@ -440,59 +446,27 @@ static void gfs2_init_dir(struct buffer_head *dibh, */ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, - const char *symname, struct buffer_head **bhp) + const char *symname) { - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_dinode *di; struct buffer_head 
*dibh; - struct timespec tv = CURRENT_TIME; dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); - gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); di = (struct gfs2_dinode *)dibh->b_data; + gfs2_dinode_out(ip, di); - di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); - di->di_num.no_addr = cpu_to_be64(ip->i_no_addr); - di->di_mode = cpu_to_be32(ip->i_inode.i_mode); - di->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode)); - di->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode)); - di->di_nlink = 0; - di->di_size = cpu_to_be64(ip->i_inode.i_size); - di->di_blocks = cpu_to_be64(1); - di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev)); di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev)); - di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_no_addr); - di->di_generation = cpu_to_be64(ip->i_generation); - di->di_flags = 0; di->__pad1 = 0; - di->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) ? 
GFS2_FORMAT_DE : 0); - di->di_height = 0; di->__pad2 = 0; di->__pad3 = 0; - di->di_depth = 0; - di->di_entries = 0; memset(&di->__pad4, 0, sizeof(di->__pad4)); - di->di_eattr = 0; - di->di_atime_nsec = cpu_to_be32(tv.tv_nsec); - di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec); - di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); memset(&di->di_reserved, 0, sizeof(di->di_reserved)); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); switch(ip->i_inode.i_mode & S_IFMT) { - case S_IFREG: - if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || - gfs2_tune_get(sdp, gt_new_files_jdata)) - di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); - break; case S_IFDIR: - di->di_flags |= cpu_to_be32(dip->i_diskflags & - GFS2_DIF_INHERIT_JDATA); - di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); - di->di_size = cpu_to_be64(sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)); - di->di_entries = cpu_to_be32(2); gfs2_init_dir(dibh, dip); break; case S_IFLNK: @@ -501,63 +475,17 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, } set_buffer_uptodate(dibh); - - *bhp = dibh; -} - -static int make_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, - const char *symname, struct buffer_head **bhp) -{ - struct inode *inode = &ip->i_inode; - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - int error; - - error = gfs2_rindex_update(sdp); - if (error) - return error; - - error = gfs2_quota_lock(dip, inode->i_uid, inode->i_gid); - if (error) - return error; - - error = gfs2_quota_check(dip, inode->i_uid, inode->i_gid); - if (error) - goto out_quota; - - error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0); - if (error) - goto out_quota; - - init_dinode(dip, ip, symname, bhp); - gfs2_quota_change(dip, +1, inode->i_uid, inode->i_gid); - gfs2_trans_end(sdp); - -out_quota: - gfs2_quota_unlock(dip); - return error; + brelse(dibh); } static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip) + struct gfs2_inode *ip, int arq) { struct gfs2_sbd *sdp = 
GFS2_SB(&dip->i_inode); - int alloc_required; - struct buffer_head *dibh; int error; - error = gfs2_rindex_update(sdp); - if (error) - return error; - - error = gfs2_quota_lock(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); - if (error) - goto fail; - - error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name); - if (alloc_required < 0) - goto fail_quota_locks; - if (alloc_required) { - error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid); + if (arq) { + error = gfs2_quota_lock_check(dip); if (error) goto fail_quota_locks; @@ -581,26 +509,12 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_end_trans; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto fail_end_trans; - set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1); - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - return 0; - fail_end_trans: gfs2_trans_end(sdp); - fail_ipreserv: - if (alloc_required) - gfs2_inplace_release(dip); - + gfs2_inplace_release(dip); fail_quota_locks: gfs2_quota_unlock(dip); - -fail: return error; } @@ -650,8 +564,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_glock *io_gl; int error; - struct buffer_head *bh = NULL; u32 aflags = 0; + int arq; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -660,6 +574,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) return error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); if (error) goto fail; @@ -674,22 +592,48 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock; + arq = error = gfs2_diradd_alloc_required(dir, name); + if (error < 0) + goto fail_gunlock; + inode = new_inode(sdp->sd_vfs); - if (!inode) { - 
gfs2_glock_dq_uninit(ghs); - return -ENOMEM; - } + error = -ENOMEM; + if (!inode) + goto fail_gunlock; + ip = GFS2_I(inode); error = gfs2_rs_alloc(ip); if (error) goto fail_free_inode; - set_bit(GIF_INVALID, &ip->i_flags); inode->i_mode = mode; + set_nlink(inode, S_ISDIR(mode) ? 2 : 1); inode->i_rdev = dev; inode->i_size = size; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + gfs2_set_inode_blocks(inode, 1); munge_mode_uid_gid(dip, inode); ip->i_goal = dip->i_goal; + ip->i_diskflags = 0; + ip->i_eattr = 0; + ip->i_height = 0; + ip->i_depth = 0; + ip->i_entries = 0; + + switch(mode & S_IFMT) { + case S_IFREG: + if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || + gfs2_tune_get(sdp, gt_new_files_jdata)) + ip->i_diskflags |= GFS2_DIF_JDATA; + gfs2_set_aops(inode); + break; + case S_IFDIR: + ip->i_diskflags |= (dip->i_diskflags & GFS2_DIF_INHERIT_JDATA); + ip->i_diskflags |= GFS2_DIF_JDATA; + ip->i_entries = 2; + break; + } + gfs2_set_inode_flags(inode); if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) || (dip->i_diskflags & GFS2_DIF_TOPDIR)) @@ -708,10 +652,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_free_inode; - error = make_dinode(dip, ip, symname, &bh); + error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) goto fail_gunlock2; + init_dinode(dip, ip, symname); + gfs2_trans_end(sdp); + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (error) goto fail_gunlock2; @@ -725,10 +672,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_set_iop(inode); insert_inode_hash(inode); - error = gfs2_inode_refresh(ip); - if (error) - goto fail_gunlock3; - error = gfs2_acl_create(dip, inode); if (error) goto fail_gunlock3; @@ -737,18 +680,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock3; - error = link_dinode(dip, name, ip); + error = link_dinode(dip, name, ip, arq); if (error) goto 
fail_gunlock3; - if (bh) - brelse(bh); - - gfs2_trans_end(sdp); - gfs2_inplace_release(dip); - gfs2_quota_unlock(dip); mark_inode_dirty(inode); - gfs2_glock_dq_uninit_m(2, ghs); + gfs2_glock_dq_uninit(ghs); + gfs2_glock_dq_uninit(ghs + 1); d_instantiate(dentry, inode); return 0; @@ -769,12 +707,12 @@ fail_free_inode: fail_gunlock: gfs2_glock_dq_uninit(ghs); if (inode && !IS_ERR(inode)) { + clear_nlink(inode); + mark_inode_dirty(inode); set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); iput(inode); } fail: - if (bh) - brelse(bh); return error; } @@ -1151,7 +1089,9 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0); + struct gfs2_sbd *sdp = GFS2_SB(dir); + unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, dsize, 0); } /** diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 9802de0f85e6..c8423d6de6c3 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -483,12 +483,8 @@ static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, static int all_jid_bits_clear(char *lvb) { - int i; - for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) { - if (lvb[i]) - return 0; - } - return 1; + return !memchr_inv(lvb + JID_BITMAP_OFFSET, 0, + GDLM_LVB_SIZE - JID_BITMAP_OFFSET); } static void sync_wait_cb(void *arg) @@ -580,7 +576,6 @@ static void gfs2_control_func(struct work_struct *work) { struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work); struct lm_lockstruct *ls = &sdp->sd_lockstruct; - char lvb_bits[GDLM_LVB_SIZE]; uint32_t block_gen, start_gen, lvb_gen, flags; int recover_set = 0; int write_lvb = 0; @@ -634,7 +629,7 @@ static void gfs2_control_func(struct work_struct *work) return; } - control_lvb_read(ls, &lvb_gen, lvb_bits); + control_lvb_read(ls, &lvb_gen, 
ls->ls_lvb_bits); spin_lock(&ls->ls_recover_spin); if (block_gen != ls->ls_recover_block || @@ -664,10 +659,10 @@ static void gfs2_control_func(struct work_struct *work) ls->ls_recover_result[i] = 0; - if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) + if (!test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) continue; - __clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + __clear_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET); write_lvb = 1; } } @@ -691,7 +686,7 @@ static void gfs2_control_func(struct work_struct *work) continue; if (ls->ls_recover_submit[i] < start_gen) { ls->ls_recover_submit[i] = 0; - __set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + __set_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET); } } /* even if there are no bits to set, we need to write the @@ -705,7 +700,7 @@ static void gfs2_control_func(struct work_struct *work) spin_unlock(&ls->ls_recover_spin); if (write_lvb) { - control_lvb_write(ls, start_gen, lvb_bits); + control_lvb_write(ls, start_gen, ls->ls_lvb_bits); flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK; } else { flags = DLM_LKF_CONVERT; @@ -725,7 +720,7 @@ static void gfs2_control_func(struct work_struct *work) */ for (i = 0; i < recover_size; i++) { - if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) { + if (test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) { fs_info(sdp, "recover generation %u jid %d\n", start_gen, i); gfs2_recover_set(sdp, i); @@ -758,7 +753,6 @@ static void gfs2_control_func(struct work_struct *work) static int control_mount(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - char lvb_bits[GDLM_LVB_SIZE]; uint32_t start_gen, block_gen, mount_gen, lvb_gen; int mounted_mode; int retries = 0; @@ -857,7 +851,7 @@ locks_done: * lvb_gen will be non-zero. 
*/ - control_lvb_read(ls, &lvb_gen, lvb_bits); + control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits); if (lvb_gen == 0xFFFFFFFF) { /* special value to force mount attempts to fail */ @@ -887,7 +881,7 @@ locks_done: * and all lvb bits to be clear (no pending journal recoveries.) */ - if (!all_jid_bits_clear(lvb_bits)) { + if (!all_jid_bits_clear(ls->ls_lvb_bits)) { /* journals need recovery, wait until all are clear */ fs_info(sdp, "control_mount wait for journal recovery\n"); goto restart; @@ -949,7 +943,6 @@ static int dlm_recovery_wait(void *word) static int control_first_done(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - char lvb_bits[GDLM_LVB_SIZE]; uint32_t start_gen, block_gen; int error; @@ -991,8 +984,8 @@ restart: memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); spin_unlock(&ls->ls_recover_spin); - memset(lvb_bits, 0, sizeof(lvb_bits)); - control_lvb_write(ls, start_gen, lvb_bits); + memset(ls->ls_lvb_bits, 0, GDLM_LVB_SIZE); + control_lvb_write(ls, start_gen, ls->ls_lvb_bits); error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT); if (error) @@ -1022,6 +1015,12 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, uint32_t old_size, new_size; int i, max_jid; + if (!ls->ls_lvb_bits) { + ls->ls_lvb_bits = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); + if (!ls->ls_lvb_bits) + return -ENOMEM; + } + max_jid = 0; for (i = 0; i < num_slots; i++) { if (max_jid < slots[i].slot - 1) @@ -1057,6 +1056,7 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, static void free_recover_size(struct lm_lockstruct *ls) { + kfree(ls->ls_lvb_bits); kfree(ls->ls_recover_submit); kfree(ls->ls_recover_result); ls->ls_recover_submit = NULL; @@ -1205,6 +1205,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) ls->ls_recover_size = 0; ls->ls_recover_submit = NULL; ls->ls_recover_result = NULL; + ls->ls_lvb_bits = NULL; error = set_recover_size(sdp, NULL, 0); if (error) diff --git 
a/fs/gfs2/log.c b/fs/gfs2/log.c index 9a2ca8be7647..b404f4853034 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -73,7 +73,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, void gfs2_remove_from_ail(struct gfs2_bufdata *bd) { - bd->bd_ail = NULL; + bd->bd_tr = NULL; list_del_init(&bd->bd_ail_st_list); list_del_init(&bd->bd_ail_gl_list); atomic_dec(&bd->bd_gl->gl_ail_count); @@ -90,7 +90,7 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct writeback_control *wbc, - struct gfs2_ail *ai) + struct gfs2_trans *tr) __releases(&sdp->sd_ail_lock) __acquires(&sdp->sd_ail_lock) { @@ -99,15 +99,15 @@ __acquires(&sdp->sd_ail_lock) struct gfs2_bufdata *bd, *s; struct buffer_head *bh; - list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, bd_ail_st_list) { + list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; - gfs2_assert(sdp, bd->bd_ail == ai); + gfs2_assert(sdp, bd->bd_tr == tr); if (!buffer_busy(bh)) { if (!buffer_uptodate(bh)) gfs2_io_error_bh(sdp, bh); - list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); continue; } @@ -116,7 +116,7 @@ __acquires(&sdp->sd_ail_lock) if (gl == bd->bd_gl) continue; gl = bd->bd_gl; - list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); + list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list); mapping = bh->b_page->mapping; if (!mapping) continue; @@ -144,15 +144,15 @@ __acquires(&sdp->sd_ail_lock) void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) { struct list_head *head = &sdp->sd_ail1_list; - struct gfs2_ail *ai; + struct gfs2_trans *tr; trace_gfs2_ail_flush(sdp, wbc, 1); spin_lock(&sdp->sd_ail_lock); restart: - list_for_each_entry_reverse(ai, head, ai_list) { + list_for_each_entry_reverse(tr, head, tr_list) { if (wbc->nr_to_write <= 0) break; - if (gfs2_ail1_start_one(sdp, wbc, ai)) + if (gfs2_ail1_start_one(sdp, wbc, tr)) goto 
restart; } spin_unlock(&sdp->sd_ail_lock); @@ -183,20 +183,20 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) * */ -static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; - list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, + list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; - gfs2_assert(sdp, bd->bd_ail == ai); + gfs2_assert(sdp, bd->bd_tr == tr); if (buffer_busy(bh)) continue; if (!buffer_uptodate(bh)) gfs2_io_error_bh(sdp, bh); - list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); } } @@ -210,14 +210,14 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) static int gfs2_ail1_empty(struct gfs2_sbd *sdp) { - struct gfs2_ail *ai, *s; + struct gfs2_trans *tr, *s; int ret; spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { - gfs2_ail1_empty_one(sdp, ai); - if (list_empty(&ai->ai_ail1_list)) - list_move(&ai->ai_list, &sdp->sd_ail2_list); + list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { + gfs2_ail1_empty_one(sdp, tr); + if (list_empty(&tr->tr_ail1_list)) + list_move(&tr->tr_list, &sdp->sd_ail2_list); else break; } @@ -229,13 +229,13 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp) static void gfs2_ail1_wait(struct gfs2_sbd *sdp) { - struct gfs2_ail *ai; + struct gfs2_trans *tr; struct gfs2_bufdata *bd; struct buffer_head *bh; spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_reverse(ai, &sdp->sd_ail1_list, ai_list) { - list_for_each_entry(bd, &ai->ai_ail1_list, bd_ail_st_list) { + list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry(bd, &tr->tr_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; if (!buffer_locked(bh)) continue; @@ -256,40 +256,40 @@ static void gfs2_ail1_wait(struct 
gfs2_sbd *sdp) * */ -static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct list_head *head = &ai->ai_ail2_list; + struct list_head *head = &tr->tr_ail2_list; struct gfs2_bufdata *bd; while (!list_empty(head)) { bd = list_entry(head->prev, struct gfs2_bufdata, bd_ail_st_list); - gfs2_assert(sdp, bd->bd_ail == ai); + gfs2_assert(sdp, bd->bd_tr == tr); gfs2_remove_from_ail(bd); } } static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) { - struct gfs2_ail *ai, *safe; + struct gfs2_trans *tr, *safe; unsigned int old_tail = sdp->sd_log_tail; int wrap = (new_tail < old_tail); int a, b, rm; spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { - a = (old_tail <= ai->ai_first); - b = (ai->ai_first < new_tail); + list_for_each_entry_safe(tr, safe, &sdp->sd_ail2_list, tr_list) { + a = (old_tail <= tr->tr_first); + b = (tr->tr_first < new_tail); rm = (wrap) ? 
(a || b) : (a && b); if (!rm) continue; - gfs2_ail2_empty_one(sdp, ai); - list_del(&ai->ai_list); - gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list)); - gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list)); - kfree(ai); + gfs2_ail2_empty_one(sdp, tr); + list_del(&tr->tr_list); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); + kfree(tr); } spin_unlock(&sdp->sd_ail_lock); @@ -435,7 +435,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp) static unsigned int current_tail(struct gfs2_sbd *sdp) { - struct gfs2_ail *ai; + struct gfs2_trans *tr; unsigned int tail; spin_lock(&sdp->sd_ail_lock); @@ -443,8 +443,9 @@ static unsigned int current_tail(struct gfs2_sbd *sdp) if (list_empty(&sdp->sd_ail1_list)) { tail = sdp->sd_log_head; } else { - ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list); - tail = ai->ai_first; + tr = list_entry(sdp->sd_ail1_list.prev, struct gfs2_trans, + tr_list); + tail = tr->tr_first; } spin_unlock(&sdp->sd_ail_lock); @@ -600,7 +601,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) { - struct gfs2_ail *ai; + struct gfs2_trans *tr; down_write(&sdp->sd_log_flush_lock); @@ -611,9 +612,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) } trace_gfs2_log_flush(sdp, 1); - ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); - INIT_LIST_HEAD(&ai->ai_ail1_list); - INIT_LIST_HEAD(&ai->ai_ail2_list); + tr = sdp->sd_log_tr; + if (tr) { + sdp->sd_log_tr = NULL; + INIT_LIST_HEAD(&tr->tr_ail1_list); + INIT_LIST_HEAD(&tr->tr_ail2_list); + } if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) { printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf, @@ -630,7 +634,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) sdp->sd_log_flush_head = sdp->sd_log_head; sdp->sd_log_flush_wrapped = 0; - ai->ai_first = sdp->sd_log_flush_head; + if (tr) + 
tr->tr_first = sdp->sd_log_flush_head; gfs2_ordered_write(sdp); lops_before_commit(sdp); @@ -643,7 +648,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) trace_gfs2_log_blocks(sdp, -1); log_write_header(sdp, 0); } - lops_after_commit(sdp, ai); + lops_after_commit(sdp, tr); gfs2_log_lock(sdp); sdp->sd_log_head = sdp->sd_log_flush_head; @@ -653,16 +658,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) sdp->sd_log_commited_revoke = 0; spin_lock(&sdp->sd_ail_lock); - if (!list_empty(&ai->ai_ail1_list)) { - list_add(&ai->ai_list, &sdp->sd_ail1_list); - ai = NULL; + if (tr && !list_empty(&tr->tr_ail1_list)) { + list_add(&tr->tr_list, &sdp->sd_ail1_list); + tr = NULL; } spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); trace_gfs2_log_flush(sdp, 0); up_write(&sdp->sd_log_flush_lock); - kfree(ai); + kfree(tr); } static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) @@ -687,6 +692,12 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) sdp->sd_jdesc->jd_blocks); sdp->sd_log_blks_reserved = reserved; + if (sdp->sd_log_tr == NULL && + (tr->tr_num_buf_new || tr->tr_num_databuf_new)) { + gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl); + sdp->sd_log_tr = tr; + tr->tr_attached = 1; + } gfs2_log_unlock(sdp); } @@ -708,7 +719,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { log_refund(sdp, tr); - up_read(&sdp->sd_log_flush_lock); if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) || ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) > diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index a5055977a214..c5fa758fd844 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -53,8 +53,8 @@ void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) * to in-place disk block, remove it from the AIL. 
*/ spin_lock(&sdp->sd_ail_lock); - if (bd->bd_ail) - list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); + if (bd->bd_tr) + list_move(&bd->bd_ail_st_list, &bd->bd_tr->tr_ail2_list); spin_unlock(&sdp->sd_ail_lock); get_bh(bh); atomic_inc(&sdp->sd_log_pinned); @@ -94,7 +94,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) */ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, - struct gfs2_ail *ai) + struct gfs2_trans *tr) { struct gfs2_bufdata *bd = bh->b_private; @@ -109,7 +109,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, maybe_release_space(bd); spin_lock(&sdp->sd_ail_lock); - if (bd->bd_ail) { + if (bd->bd_tr) { list_del(&bd->bd_ail_st_list); brelse(bh); } else { @@ -117,8 +117,8 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list); atomic_inc(&gl->gl_ail_count); } - bd->bd_ail = ai; - list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); + bd->bd_tr = tr; + list_add(&bd->bd_ail_st_list, &tr->tr_ail1_list); spin_unlock(&sdp->sd_ail_lock); clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); @@ -300,7 +300,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) u64 nblk; if (bio) { - nblk = bio->bi_sector + bio_sectors(bio); + nblk = bio_end_sector(bio); nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk) return bio; @@ -480,17 +480,22 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) &sdp->sd_log_le_buf, 0); } -static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct list_head *head = &sdp->sd_log_le_buf; struct gfs2_bufdata *bd; + if (tr == NULL) { + gfs2_assert(sdp, list_empty(head)); + return; + } + while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); sdp->sd_log_num_buf--; - gfs2_unpin(sdp, bd->bd_bh, ai); + gfs2_unpin(sdp, bd->bd_bh, tr); } 
gfs2_assert_warn(sdp, !sdp->sd_log_num_buf); } @@ -613,7 +618,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) gfs2_log_write_page(sdp, page); } -static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct list_head *head = &sdp->sd_log_le_revoke; struct gfs2_bufdata *bd; @@ -791,16 +796,21 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); } -static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct list_head *head = &sdp->sd_log_le_databuf; struct gfs2_bufdata *bd; + if (tr == NULL) { + gfs2_assert(sdp, list_empty(head)); + return; + } + while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); sdp->sd_log_num_databuf--; - gfs2_unpin(sdp, bd->bd_bh, ai); + gfs2_unpin(sdp, bd->bd_bh, tr); } gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); } diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index ba77b7da8325..87e062e05c92 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -55,12 +55,13 @@ static inline void lops_before_commit(struct gfs2_sbd *sdp) gfs2_log_ops[x]->lo_before_commit(sdp); } -static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static inline void lops_after_commit(struct gfs2_sbd *sdp, + struct gfs2_trans *tr) { int x; for (x = 0; gfs2_log_ops[x]; x++) if (gfs2_log_ops[x]->lo_after_commit) - gfs2_log_ops[x]->lo_after_commit(sdp, ai); + gfs2_log_ops[x]->lo_after_commit(sdp, tr); } static inline void lops_before_scan(struct gfs2_jdesc *jd, diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index b059bbb5059e..1a89afb68472 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -295,7 +295,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, 
struct gfs2_trans *tr, int } if (bd) { spin_lock(&sdp->sd_ail_lock); - if (bd->bd_ail) { + if (bd->bd_tr) { gfs2_remove_from_ail(bd); bh->b_private = NULL; bd->bd_bh = NULL; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1b612be4b873..60ede2a0f43f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -20,6 +20,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/quotaops.h> #include <linux/lockdep.h> +#include <linux/module.h> #include "gfs2.h" #include "incore.h" @@ -1425,6 +1426,7 @@ struct file_system_type gfs2_fs_type = { .kill_sb = gfs2_kill_sb, .owner = THIS_MODULE, }; +MODULE_ALIAS_FS("gfs2"); struct file_system_type gfs2meta_fs_type = { .name = "gfs2meta", @@ -1432,4 +1434,4 @@ struct file_system_type gfs2meta_fs_type = { .mount = gfs2_mount_meta, .owner = THIS_MODULE, }; - +MODULE_ALIAS_FS("gfs2meta"); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index d1f51fd73f86..0c5a575b513e 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -576,7 +576,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip) RB_CLEAR_NODE(&ip->i_res->rs_node); out: up_write(&ip->i_rw_mutex); - return 0; + return error; } static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) @@ -592,7 +592,7 @@ static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) * @rs: The reservation to remove * */ -static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) +static void __rs_deltree(struct gfs2_blkreserv *rs) { struct gfs2_rgrpd *rgd; @@ -605,7 +605,7 @@ static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) RB_CLEAR_NODE(&rs->rs_node); if (rs->rs_free) { - /* return reserved blocks to the rgrp and the ip */ + /* return reserved blocks to the rgrp */ BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; rs->rs_free = 0; @@ -619,14 +619,14 @@ static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) * @rs: The reservation to remove * */ -void 
gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs) +void gfs2_rs_deltree(struct gfs2_blkreserv *rs) { struct gfs2_rgrpd *rgd; rgd = rs->rs_rbm.rgd; if (rgd) { spin_lock(&rgd->rd_rsspin); - __rs_deltree(ip, rs); + __rs_deltree(rs); spin_unlock(&rgd->rd_rsspin); } } @@ -640,7 +640,7 @@ void gfs2_rs_delete(struct gfs2_inode *ip) { down_write(&ip->i_rw_mutex); if (ip->i_res) { - gfs2_rs_deltree(ip, ip->i_res); + gfs2_rs_deltree(ip->i_res); BUG_ON(ip->i_res->rs_free); kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); ip->i_res = NULL; @@ -664,7 +664,7 @@ static void return_all_reservations(struct gfs2_rgrpd *rgd) spin_lock(&rgd->rd_rsspin); while ((n = rb_first(&rgd->rd_rstree))) { rs = rb_entry(n, struct gfs2_blkreserv, rs_node); - __rs_deltree(NULL, rs); + __rs_deltree(rs); } spin_unlock(&rgd->rd_rsspin); } @@ -1181,12 +1181,9 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed) { struct super_block *sb = sdp->sd_vfs; - struct block_device *bdev = sb->s_bdev; - const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / - bdev_logical_block_size(sb->s_bdev); u64 blk; sector_t start = 0; - sector_t nr_sects = 0; + sector_t nr_blks = 0; int rv; unsigned int x; u32 trimmed = 0; @@ -1206,35 +1203,34 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, if (diff == 0) continue; blk = offset + ((bi->bi_start + x) * GFS2_NBBY); - blk *= sects_per_blk; /* convert to sectors */ while(diff) { if (diff & 1) { - if (nr_sects == 0) + if (nr_blks == 0) goto start_new_extent; - if ((start + nr_sects) != blk) { - if (nr_sects >= minlen) { - rv = blkdev_issue_discard(bdev, - start, nr_sects, + if ((start + nr_blks) != blk) { + if (nr_blks >= minlen) { + rv = sb_issue_discard(sb, + start, nr_blks, GFP_NOFS, 0); if (rv) goto fail; - trimmed += nr_sects; + trimmed += nr_blks; } - nr_sects = 0; + nr_blks = 0; start_new_extent: start = blk; } - nr_sects += sects_per_blk; + nr_blks++; } diff >>= 2; 
- blk += sects_per_blk; + blk++; } } - if (nr_sects >= minlen) { - rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0); + if (nr_blks >= minlen) { + rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0); if (rv) goto fail; - trimmed += nr_sects; + trimmed += nr_blks; } if (ptrimmed) *ptrimmed = trimmed; @@ -1878,7 +1874,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags) /* Drop reservation, if we couldn't use reserved rgrp */ if (gfs2_rs_active(rs)) - gfs2_rs_deltree(ip, rs); + gfs2_rs_deltree(rs); check_rgrp: /* Check for unlinked inodes which can be reclaimed */ if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) @@ -2091,7 +2087,7 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip, if (rs->rs_free && !ret) goto out; } - __rs_deltree(ip, rs); + __rs_deltree(rs); } out: spin_unlock(&rgd->rd_rsspin); @@ -2184,13 +2180,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, if (dinode) gfs2_trans_add_unrevoke(sdp, block, 1); - /* - * This needs reviewing to see why we cannot do the quota change - * at this point in the dinode case. 
- */ - if (ndata) - gfs2_quota_change(ip, ndata, ip->i_inode.i_uid, - ip->i_inode.i_gid); + gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); rbm.rgd->rd_free_clone -= *nblocks; trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks, diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 842185853f6b..5b3f4a896e6c 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -47,7 +47,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, bool dinode, u64 *generation); extern int gfs2_rs_alloc(struct gfs2_inode *ip); -extern void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs); +extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); extern void gfs2_rs_delete(struct gfs2_inode *ip); extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index cab77b8ba84f..917c8e1eb4ae 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1512,7 +1512,7 @@ out_truncate: out_unlock: /* Error path for case 1 */ if (gfs2_rs_active(ip->i_res)) - gfs2_rs_deltree(ip, ip->i_res); + gfs2_rs_deltree(ip->i_res); if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) gfs2_glock_dq(&ip->i_iopen_gh); diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 2ee13e841e9f..20c007d747ab 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -159,9 +159,9 @@ TRACE_EVENT(gfs2_glock_put, /* Callback (local or remote) requesting lock demotion */ TRACE_EVENT(gfs2_demote_rq, - TP_PROTO(const struct gfs2_glock *gl), + TP_PROTO(const struct gfs2_glock *gl, bool remote), - TP_ARGS(gl), + TP_ARGS(gl, remote), TP_STRUCT__entry( __field( dev_t, dev ) @@ -170,6 +170,7 @@ TRACE_EVENT(gfs2_demote_rq, __field( u8, cur_state ) __field( u8, dmt_state ) __field( unsigned long, flags ) + __field( bool, remote ) ), TP_fast_assign( @@ -179,14 +180,16 @@ TRACE_EVENT(gfs2_demote_rq, __entry->cur_state = 
glock_trace_state(gl->gl_state); __entry->dmt_state = glock_trace_state(gl->gl_demote_state); __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); + __entry->remote = remote; ), - TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s", + TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, (unsigned long long)__entry->glnum, glock_trace_name(__entry->cur_state), glock_trace_name(__entry->dmt_state), - show_glock_flags(__entry->flags)) + show_glock_flags(__entry->flags), + __entry->remote ? "remote" : "local") ); diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 88162fae27a5..7374907742a8 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -96,7 +96,8 @@ static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) static void gfs2_print_trans(const struct gfs2_trans *tr) { - print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); + printk(KERN_WARNING "GFS2: Transaction created at: %pSR\n", + (void *)tr->tr_ip); printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n", tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n", @@ -135,8 +136,10 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) if (tr->tr_t_gh.gh_gl) { gfs2_glock_dq(&tr->tr_t_gh); gfs2_holder_uninit(&tr->tr_t_gh); - kfree(tr); + if (!tr->tr_attached) + kfree(tr); } + up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) gfs2_log_flush(sdp, NULL); diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index 571abe97b42a..de69d8a24f6d 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -22,7 +22,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); + hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + 
tree->cnid, __builtin_return_address(0)); mutex_lock(&tree->tree_lock); return 0; } @@ -31,7 +32,8 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); + hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } @@ -135,8 +137,8 @@ int hfs_brec_find(struct hfs_find_data *fd) return res; invalid: - printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", - height, bnode->height, bnode->type, nidx, parent); + pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", + height, bnode->height, bnode->type, nidx, parent); res = -EIO; release: hfs_bnode_put(bnode); diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c index c6e97366e8ac..28307bc9ec1e 100644 --- a/fs/hfs/bitmap.c +++ b/fs/hfs/bitmap.c @@ -158,7 +158,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) } } - dprint(DBG_BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); + hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); HFS_SB(sb)->free_ablocks -= *num_bits; hfs_bitmap_dirty(sb); out: @@ -200,7 +200,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) if (!count) return 0; - dprint(DBG_BITMAP, "clear_bits: %u,%u\n", start, count); + hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count); /* are all of the bits in range? 
*/ if ((start + count) > HFS_SB(sb)->fs_ablocks) return -2; diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index cdb41a1f6a64..f3b1a15ccd59 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -100,7 +100,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_btree *tree; struct page *src_page, *dst_page; - dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; tree = src_node->tree; @@ -120,7 +120,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) struct page *page; void *ptr; - dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); if (!len) return; src += node->page_offset; @@ -138,16 +138,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - dprint(DBG_BNODE_MOD, " %d", key_off); + hfs_dbg_cont(BNODE_MOD, " %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -155,17 +155,18 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1; else tmp = node->tree->max_key_len + 1; - dprint(DBG_BNODE_MOD, " (%d,%d", tmp, hfs_bnode_read_u8(node, key_off)); + hfs_dbg_cont(BNODE_MOD, " (%d,%d", + tmp, hfs_bnode_read_u8(node, key_off)); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); } else if (i && node->type == 
HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u8(node, key_off); - dprint(DBG_BNODE_MOD, " (%d)", tmp); + hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); } } - dprint(DBG_BNODE_MOD, "\n"); + hfs_dbg_cont(BNODE_MOD, "\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -220,7 +221,7 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) struct hfs_bnode *node; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); + pr_err("request for non-existent node %d in B*Tree\n", cnid); return NULL; } @@ -243,7 +244,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) loff_t off; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid); + pr_err("request for non-existent node %d in B*Tree\n", cnid); return NULL; } @@ -257,8 +258,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n", - node->tree->cnid, node->this); + hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); node2 = hfs_bnode_findhash(tree, cnid); @@ -301,7 +302,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -443,8 +444,9 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n", - node->tree->cnid, node->this, atomic_read(&node->refcnt)); + hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); } } 
@@ -455,8 +457,9 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", - node->tree->cnid, node->this, atomic_read(&node->refcnt)); + hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) return; diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 92fb358ce824..9f4ee7f52026 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -47,15 +47,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) if (node->tree->attributes & HFS_TREE_BIGKEYS) { retval = hfs_bnode_read_u16(node, recoff) + 2; if (retval > node->tree->max_key_len + 2) { - printk(KERN_ERR "hfs: keylen %d too large\n", - retval); + pr_err("keylen %d too large\n", retval); retval = 0; } } else { retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1; if (retval > node->tree->max_key_len + 1) { - printk(KERN_ERR "hfs: keylen %d too large\n", - retval); + pr_err("keylen %d too large\n", retval); retval = 0; } } @@ -94,7 +92,8 @@ again: end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off); + hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) panic("not enough room!\n"); @@ -190,7 +189,8 @@ again: mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength); + hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); if (!node->parent) @@ -240,7 +240,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); 
- dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -374,7 +374,8 @@ again: newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1; else fd->keylength = newkeylen = tree->max_key_len + 1; - dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen); + hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; @@ -385,7 +386,7 @@ again: end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { - printk(KERN_DEBUG "hfs: splitting index node...\n"); + printk(KERN_DEBUG "splitting index node...\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 1cbdeea1db44..1ab19e660e69 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -48,7 +48,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); if (HFS_I(tree->inode)->alloc_blocks > HFS_I(tree->inode)->first_blocks) { - printk(KERN_ERR "hfs: invalid btree extent records\n"); + pr_err("invalid btree extent records\n"); unlock_new_inode(tree->inode); goto free_inode; } @@ -60,8 +60,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); if (!HFS_I(tree->inode)->first_blocks) { - printk(KERN_ERR "hfs: invalid btree extent records " - "(0 size).\n"); + pr_err("invalid btree extent records (0 size)\n"); unlock_new_inode(tree->inode); goto free_inode; } @@ -100,15 +99,15 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke switch (id) { case HFS_EXT_CNID: if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) { - printk(KERN_ERR "hfs: invalid extent 
max_key_len %d\n", - tree->max_key_len); + pr_err("invalid extent max_key_len %d\n", + tree->max_key_len); goto fail_page; } break; case HFS_CAT_CNID: if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) { - printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n", - tree->max_key_len); + pr_err("invalid catalog max_key_len %d\n", + tree->max_key_len); goto fail_page; } break; @@ -146,8 +145,9 @@ void hfs_btree_close(struct hfs_btree *tree) while ((node = tree->node_hash[i])) { tree->node_hash[i] = node->next_hash; if (atomic_read(&node->refcnt)) - printk(KERN_ERR "hfs: node %d:%d still has %d user(s)!\n", - node->tree->cnid, node->this, atomic_read(&node->refcnt)); + pr_err("node %d:%d still has %d user(s)!\n", + node->tree->cnid, node->this, + atomic_read(&node->refcnt)); hfs_bnode_free(node); tree->node_hash_cnt--; } @@ -290,7 +290,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) kunmap(*pagep); nidx = node->next; if (!nidx) { - printk(KERN_DEBUG "hfs: create new bmap node...\n"); + printk(KERN_DEBUG "create new bmap node...\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); @@ -316,7 +316,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); @@ -331,7 +331,8 @@ void hfs_bmap_free(struct hfs_bnode *node) hfs_bnode_put(node); if (!i) { /* panic */; - printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this); + pr_crit("unable to free bnode %u. bmap not found!\n", + node->this); return; } node = hfs_bnode_find(tree, i); @@ -339,7 +340,8 @@ void hfs_bmap_free(struct hfs_bnode *node) return; if (node->type != HFS_NODE_MAP) { /* panic */; - printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type); + pr_crit("invalid bmap found! 
(%u,%d)\n", + node->this, node->type); hfs_bnode_put(node); return; } @@ -352,7 +354,8 @@ void hfs_bmap_free(struct hfs_bnode *node) m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { - printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type); + pr_crit("trying to free free bnode %u(%d)\n", + node->this, node->type); kunmap(page); hfs_bnode_put(node); return; diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index 424b0337f524..ff0316b925a5 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -87,12 +87,15 @@ int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode * int entry_size; int err; - dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); + hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + str->name, cnid, inode->i_nlink); if (dir->i_size >= HFS_MAX_VALENCE) return -ENOSPC; sb = dir->i_sb; - hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (err) + return err; hfs_cat_build_key(sb, fd.search_key, cnid, NULL); entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ? @@ -184,14 +187,14 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid, type = rec.type; if (type != HFS_CDR_THD && type != HFS_CDR_FTH) { - printk(KERN_ERR "hfs: found bad thread record in catalog\n"); + pr_err("found bad thread record in catalog\n"); return -EIO; } fd->search_key->cat.ParID = rec.thread.ParID; len = fd->search_key->cat.CName.len = rec.thread.CName.len; if (len > HFS_NAMELEN) { - printk(KERN_ERR "hfs: bad catalog namelength\n"); + pr_err("bad catalog namelength\n"); return -EIO; } memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len); @@ -212,9 +215,11 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) struct list_head *pos; int res, type; - dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? 
str->name : NULL, cnid); sb = dir->i_sb; - hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (res) + return res; hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str); res = hfs_brec_find(&fd); @@ -278,10 +283,13 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name, int entry_size, type; int err; - dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, + hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); sb = src_dir->i_sb; - hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd); + err = hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd); + if (err) + return err; dst_fd = src_fd; /* find the old dir entry and read the data */ diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 5f7f1abd5f6d..e0101b6fb0d7 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -25,7 +25,9 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; int res; - hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + if (res) + return ERR_PTR(res); hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); res = hfs_brec_read(&fd, &rec, sizeof(rec)); if (res) { @@ -63,7 +65,9 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (filp->f_pos >= inode->i_size) return 0; - hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (err) + return err; hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); err = hfs_brec_find(&fd); if (err) @@ -84,12 +88,12 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (entry.type != HFS_CDR_THD) { - printk(KERN_ERR "hfs: bad catalog folder thread\n"); + pr_err("bad catalog folder thread\n"); err = -EIO; goto out; } //if 
(fd.entrylength < HFS_MIN_THREAD_SZ) { - // printk(KERN_ERR "hfs: truncated catalog thread\n"); + // pr_err("truncated catalog thread\n"); // err = -EIO; // goto out; //} @@ -108,7 +112,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) for (;;) { if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { - printk(KERN_ERR "hfs: walked past end of dir\n"); + pr_err("walked past end of dir\n"); err = -EIO; goto out; } @@ -123,7 +127,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); if (type == HFS_CDR_DIR) { if (fd.entrylength < sizeof(struct hfs_cat_dir)) { - printk(KERN_ERR "hfs: small dir entry\n"); + pr_err("small dir entry\n"); err = -EIO; goto out; } @@ -132,7 +136,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) break; } else if (type == HFS_CDR_FIL) { if (fd.entrylength < sizeof(struct hfs_cat_file)) { - printk(KERN_ERR "hfs: small file entry\n"); + pr_err("small file entry\n"); err = -EIO; goto out; } @@ -140,7 +144,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) be32_to_cpu(entry.file.FlNum), DT_REG)) break; } else { - printk(KERN_ERR "hfs: bad catalog entry type %d\n", type); + pr_err("bad catalog entry type %d\n", type); err = -EIO; goto out; } @@ -172,7 +176,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file) { struct hfs_readdir_data *rd = file->private_data; if (rd) { + mutex_lock(&inode->i_mutex); list_del(&rd->list); + mutex_unlock(&inode->i_mutex); kfree(rd); } return 0; diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index a67955a0c36f..e33a0d36a93e 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -107,7 +107,7 @@ static u16 hfs_ext_lastblock(struct hfs_extent *ext) return be16_to_cpu(ext->block) + be16_to_cpu(ext->count); } -static void __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) +static int __hfs_ext_write_extent(struct inode 
*inode, struct hfs_find_data *fd) { int res; @@ -116,26 +116,31 @@ static void __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd res = hfs_brec_find(fd); if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) { if (res != -ENOENT) - return; + return res; hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec)); HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW); } else { if (res) - return; + return res; hfs_bnode_write(fd->bnode, HFS_I(inode)->cached_extents, fd->entryoffset, fd->entrylength); HFS_I(inode)->flags &= ~HFS_FLG_EXT_DIRTY; } + return 0; } -void hfs_ext_write_extent(struct inode *inode) +int hfs_ext_write_extent(struct inode *inode) { struct hfs_find_data fd; + int res = 0; if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { - hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); - __hfs_ext_write_extent(inode, &fd); + res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); + if (res) + return res; + res = __hfs_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } + return res; } static inline int __hfs_ext_read_extent(struct hfs_find_data *fd, struct hfs_extent *extent, @@ -161,8 +166,11 @@ static inline int __hfs_ext_cache_extent(struct hfs_find_data *fd, struct inode { int res; - if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) - __hfs_ext_write_extent(inode, fd); + if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) { + res = __hfs_ext_write_extent(inode, fd); + if (res) + return res; + } res = __hfs_ext_read_extent(fd, HFS_I(inode)->cached_extents, inode->i_ino, block, HFS_IS_RSRC(inode) ? 
HFS_FK_RSRC : HFS_FK_DATA); @@ -185,9 +193,11 @@ static int hfs_ext_read_extent(struct inode *inode, u16 block) block < HFS_I(inode)->cached_start + HFS_I(inode)->cached_blocks) return 0; - hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); - res = __hfs_ext_cache_extent(&fd, inode, block); - hfs_find_exit(&fd); + res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd); + if (!res) { + res = __hfs_ext_cache_extent(&fd, inode, block); + hfs_find_exit(&fd); + } return res; } @@ -195,11 +205,12 @@ static void hfs_dump_extent(struct hfs_extent *extent) { int i; - dprint(DBG_EXTENT, " "); + hfs_dbg(EXTENT, " "); for (i = 0; i < 3; i++) - dprint(DBG_EXTENT, " %u:%u", be16_to_cpu(extent[i].block), - be16_to_cpu(extent[i].count)); - dprint(DBG_EXTENT, "\n"); + hfs_dbg_cont(EXTENT, " %u:%u", + be16_to_cpu(extent[i].block), + be16_to_cpu(extent[i].count)); + hfs_dbg_cont(EXTENT, "\n"); } static int hfs_add_extent(struct hfs_extent *extent, u16 offset, @@ -298,7 +309,9 @@ int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type) if (total_blocks == blocks) return 0; - hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + if (res) + return res; do { res = __hfs_ext_read_extent(&fd, extent, cnid, total_blocks, type); if (res) @@ -392,10 +405,10 @@ int hfs_extend_file(struct inode *inode) goto out; } - dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { if (!HFS_I(inode)->first_blocks) { - dprint(DBG_EXTENT, "first extents\n"); + hfs_dbg(EXTENT, "first extents\n"); /* no extents yet */ HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); @@ -437,8 +450,10 @@ out: return res; insert_extent: - dprint(DBG_EXTENT, "insert new extent\n"); - hfs_ext_write_extent(inode); + hfs_dbg(EXTENT, "insert new extent\n"); + res 
= hfs_ext_write_extent(inode); + if (res) + goto out; memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); HFS_I(inode)->cached_extents[0].block = cpu_to_be16(start); @@ -460,13 +475,13 @@ void hfs_file_truncate(struct inode *inode) u32 size; int res; - dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, - (long long)HFS_I(inode)->phys_size, inode->i_size); + hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n", + inode->i_ino, (long long)HFS_I(inode)->phys_size, + inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { struct address_space *mapping = inode->i_mapping; void *fsdata; struct page *page; - int res; /* XXX: Can use generic_cont_expand? */ size = inode->i_size - 1; @@ -488,7 +503,12 @@ void hfs_file_truncate(struct inode *inode) goto out; mutex_lock(&HFS_I(inode)->extents_lock); - hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd); + if (res) { + mutex_unlock(&HFS_I(inode)->extents_lock); + /* XXX: We lack error handling of hfs_file_truncate() */ + return; + } while (1) { if (alloc_cnt == HFS_I(inode)->first_blocks) { hfs_free_extents(sb, HFS_I(inode)->first_extents, diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 693df9fe52b2..a73b11839a41 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -9,6 +9,12 @@ #ifndef _LINUX_HFS_FS_H #define _LINUX_HFS_FS_H +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/types.h> #include <linux/mutex.h> @@ -34,8 +40,18 @@ //#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) #define DBG_MASK (0) -#define dprint(flg, fmt, args...) \ - if (flg & DBG_MASK) printk(fmt , ## args) +#define hfs_dbg(flg, fmt, ...) \ +do { \ + if (DBG_##flg & DBG_MASK) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ +} while (0) + +#define hfs_dbg_cont(flg, fmt, ...) 
\ +do { \ + if (DBG_##flg & DBG_MASK) \ + pr_cont(fmt, ##__VA_ARGS__); \ +} while (0) + /* * struct hfs_inode_info @@ -174,7 +190,7 @@ extern const struct inode_operations hfs_dir_inode_operations; /* extent.c */ extern int hfs_ext_keycmp(const btree_key *, const btree_key *); extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int); -extern void hfs_ext_write_extent(struct inode *); +extern int hfs_ext_write_extent(struct inode *); extern int hfs_extend_file(struct inode *); extern void hfs_file_truncate(struct inode *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 3031dfdd2358..f9299d8a64e3 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> +#include <linux/aio.h> #include "hfs_fs.h" #include "btree.h" @@ -237,7 +238,7 @@ void hfs_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; - dprint(DBG_INODE, "delete_inode: %lu\n", inode->i_ino); + hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino); if (S_ISDIR(inode->i_mode)) { HFS_SB(sb)->folder_count--; if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) @@ -416,9 +417,12 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct inode *main_inode = inode; struct hfs_find_data fd; hfs_cat_rec rec; + int res; - dprint(DBG_INODE, "hfs_write_inode: %lu\n", inode->i_ino); - hfs_ext_write_extent(inode); + hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino); + res = hfs_ext_write_extent(inode); + if (res) + return res; if (inode->i_ino < HFS_FIRSTUSER_CNID) { switch (inode->i_ino) { @@ -515,7 +519,11 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, if (!inode) return ERR_PTR(-ENOMEM); - hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); + if (res) { + iput(inode); + return ERR_PTR(res); + } fd.search_key->cat = HFS_I(dir)->cat_key; res = hfs_brec_read(&fd, &rec, 
sizeof(rec)); if (!res) { diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index b7ec224910c5..aa3f0d6d043c 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -48,7 +48,7 @@ static int hfs_get_last_session(struct super_block *sb, *start = (sector_t)te.cdte_addr.lba << 2; return 0; } - printk(KERN_ERR "hfs: invalid session number or type of track\n"); + pr_err("invalid session number or type of track\n"); return -EINVAL; } ms_info.addr_format = CDROM_LBA; @@ -101,7 +101,7 @@ int hfs_mdb_get(struct super_block *sb) HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz); if (!size || (size & (HFS_SECTOR_SIZE - 1))) { - printk(KERN_ERR "hfs: bad allocation block size %d\n", size); + pr_err("bad allocation block size %d\n", size); goto out_bh; } @@ -118,7 +118,7 @@ int hfs_mdb_get(struct super_block *sb) size >>= 1; brelse(bh); if (!sb_set_blocksize(sb, size)) { - printk(KERN_ERR "hfs: unable to set blocksize to %u\n", size); + pr_err("unable to set blocksize to %u\n", size); goto out; } @@ -162,8 +162,8 @@ int hfs_mdb_get(struct super_block *sb) } if (!HFS_SB(sb)->alt_mdb) { - printk(KERN_WARNING "hfs: unable to locate alternate MDB\n"); - printk(KERN_WARNING "hfs: continuing without an alternate MDB\n"); + pr_warn("unable to locate alternate MDB\n"); + pr_warn("continuing without an alternate MDB\n"); } HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 
1 : 0); @@ -178,7 +178,7 @@ int hfs_mdb_get(struct super_block *sb) while (size) { bh = sb_bread(sb, off >> sb->s_blocksize_bits); if (!bh) { - printk(KERN_ERR "hfs: unable to read volume bitmap\n"); + pr_err("unable to read volume bitmap\n"); goto out; } off2 = off & (sb->s_blocksize - 1); @@ -192,23 +192,22 @@ int hfs_mdb_get(struct super_block *sb) HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp); if (!HFS_SB(sb)->ext_tree) { - printk(KERN_ERR "hfs: unable to open extent tree\n"); + pr_err("unable to open extent tree\n"); goto out; } HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp); if (!HFS_SB(sb)->cat_tree) { - printk(KERN_ERR "hfs: unable to open catalog tree\n"); + pr_err("unable to open catalog tree\n"); goto out; } attrib = mdb->drAtrb; if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { - printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " - "running fsck.hfs is recommended. mounting read-only.\n"); + pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. 
mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { - printk(KERN_WARNING "hfs: filesystem is marked locked, mounting read-only.\n"); + pr_warn("filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } if (!(sb->s_flags & MS_RDONLY)) { @@ -312,7 +311,7 @@ void hfs_mdb_commit(struct super_block *sb) while (size) { bh = sb_bread(sb, block); if (!bh) { - printk(KERN_ERR "hfs: unable to read volume bitmap\n"); + pr_err("unable to read volume bitmap\n"); break; } len = min((int)sb->s_blocksize - off, size); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index e93ddaadfd1e..2d2039e754cd 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -117,12 +117,11 @@ static int hfs_remount(struct super_block *sb, int *flags, char *data) return 0; if (!(*flags & MS_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { - printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, " - "running fsck.hfs is recommended. leaving read-only.\n"); + pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. 
leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { - printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); + pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } @@ -253,29 +252,29 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) switch (token) { case opt_uid: if (match_int(&args[0], &tmp)) { - printk(KERN_ERR "hfs: uid requires an argument\n"); + pr_err("uid requires an argument\n"); return 0; } hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp); if (!uid_valid(hsb->s_uid)) { - printk(KERN_ERR "hfs: invalid uid %d\n", tmp); + pr_err("invalid uid %d\n", tmp); return 0; } break; case opt_gid: if (match_int(&args[0], &tmp)) { - printk(KERN_ERR "hfs: gid requires an argument\n"); + pr_err("gid requires an argument\n"); return 0; } hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp); if (!gid_valid(hsb->s_gid)) { - printk(KERN_ERR "hfs: invalid gid %d\n", tmp); + pr_err("invalid gid %d\n", tmp); return 0; } break; case opt_umask: if (match_octal(&args[0], &tmp)) { - printk(KERN_ERR "hfs: umask requires a value\n"); + pr_err("umask requires a value\n"); return 0; } hsb->s_file_umask = (umode_t)tmp; @@ -283,39 +282,39 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) break; case opt_file_umask: if (match_octal(&args[0], &tmp)) { - printk(KERN_ERR "hfs: file_umask requires a value\n"); + pr_err("file_umask requires a value\n"); return 0; } hsb->s_file_umask = (umode_t)tmp; break; case opt_dir_umask: if (match_octal(&args[0], &tmp)) { - printk(KERN_ERR "hfs: dir_umask requires a value\n"); + pr_err("dir_umask requires a value\n"); return 0; } hsb->s_dir_umask = (umode_t)tmp; break; case opt_part: if (match_int(&args[0], &hsb->part)) { - printk(KERN_ERR "hfs: part requires an argument\n"); + pr_err("part requires an argument\n"); return 0; } break; case 
opt_session: if (match_int(&args[0], &hsb->session)) { - printk(KERN_ERR "hfs: session requires an argument\n"); + pr_err("session requires an argument\n"); return 0; } break; case opt_type: if (match_fourchar(&args[0], &hsb->s_type)) { - printk(KERN_ERR "hfs: type requires a 4 character value\n"); + pr_err("type requires a 4 character value\n"); return 0; } break; case opt_creator: if (match_fourchar(&args[0], &hsb->s_creator)) { - printk(KERN_ERR "hfs: creator requires a 4 character value\n"); + pr_err("creator requires a 4 character value\n"); return 0; } break; @@ -324,14 +323,14 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) break; case opt_codepage: if (hsb->nls_disk) { - printk(KERN_ERR "hfs: unable to change codepage\n"); + pr_err("unable to change codepage\n"); return 0; } p = match_strdup(&args[0]); if (p) hsb->nls_disk = load_nls(p); if (!hsb->nls_disk) { - printk(KERN_ERR "hfs: unable to load codepage \"%s\"\n", p); + pr_err("unable to load codepage \"%s\"\n", p); kfree(p); return 0; } @@ -339,14 +338,14 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) break; case opt_iocharset: if (hsb->nls_io) { - printk(KERN_ERR "hfs: unable to change iocharset\n"); + pr_err("unable to change iocharset\n"); return 0; } p = match_strdup(&args[0]); if (p) hsb->nls_io = load_nls(p); if (!hsb->nls_io) { - printk(KERN_ERR "hfs: unable to load iocharset \"%s\"\n", p); + pr_err("unable to load iocharset \"%s\"\n", p); kfree(p); return 0; } @@ -360,7 +359,7 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) if (hsb->nls_disk && !hsb->nls_io) { hsb->nls_io = load_nls_default(); if (!hsb->nls_io) { - printk(KERN_ERR "hfs: unable to load default iocharset\n"); + pr_err("unable to load default iocharset\n"); return 0; } } @@ -400,7 +399,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) res = -EINVAL; if (!parse_options((char *)data, sbi)) { - printk(KERN_ERR "hfs: unable to parse mount 
options.\n"); + pr_err("unable to parse mount options\n"); goto bail; } @@ -411,14 +410,16 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) res = hfs_mdb_get(sb); if (res) { if (!silent) - printk(KERN_WARNING "hfs: can't find a HFS filesystem on dev %s.\n", + pr_warn("can't find a HFS filesystem on dev %s\n", hfs_mdb_name(sb)); res = -EINVAL; goto bail; } /* try to get the root inode */ - hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); + if (res) + goto bail_no_root; res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); if (!res) { if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { @@ -447,7 +448,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) return 0; bail_no_root: - printk(KERN_ERR "hfs: get root inode failed.\n"); + pr_err("get root inode failed\n"); bail: hfs_mdb_put(sb); return res; @@ -466,6 +467,7 @@ static struct file_system_type hfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("hfs"); static void hfs_init_once(void *p) { diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index 8d691f124714..0f47890299c4 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -56,7 +56,7 @@ int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, if (name) { len = strlen(name); if (len > HFSPLUS_ATTR_MAX_STRLEN) { - printk(KERN_ERR "hfs: invalid xattr name's length\n"); + pr_err("invalid xattr name's length\n"); return -EINVAL; } hfsplus_asc2uni(sb, @@ -166,10 +166,10 @@ int hfsplus_find_attr(struct super_block *sb, u32 cnid, { int err = 0; - dprint(DBG_ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); + hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? 
name : NULL, cnid); if (!HFSPLUS_SB(sb)->attr_tree) { - printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + pr_err("attributes file doesn't exist\n"); return -EINVAL; } @@ -228,11 +228,11 @@ int hfsplus_create_attr(struct inode *inode, int entry_size; int err; - dprint(DBG_ATTR_MOD, "create_attr: %s,%ld\n", + hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { - printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + pr_err("attributes file doesn't exist\n"); return -EINVAL; } @@ -307,10 +307,10 @@ static int __hfsplus_delete_attr(struct inode *inode, u32 cnid, break; case HFSPLUS_ATTR_FORK_DATA: case HFSPLUS_ATTR_EXTENTS: - printk(KERN_ERR "hfs: only inline data xattr are supported\n"); + pr_err("only inline data xattr are supported\n"); return -EOPNOTSUPP; default: - printk(KERN_ERR "hfs: invalid extended attribute record\n"); + pr_err("invalid extended attribute record\n"); return -ENOENT; } @@ -328,11 +328,11 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) struct super_block *sb = inode->i_sb; struct hfs_find_data fd; - dprint(DBG_ATTR_MOD, "delete_attr: %s,%ld\n", + hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n", name ? 
name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { - printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + pr_err("attributes file doesn't exist\n"); return -EINVAL; } @@ -346,7 +346,7 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) if (err) goto out; } else { - printk(KERN_ERR "hfs: invalid extended attribute name\n"); + pr_err("invalid extended attribute name\n"); err = -EINVAL; goto out; } @@ -369,10 +369,10 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) int err = 0; struct hfs_find_data fd; - dprint(DBG_ATTR_MOD, "delete_all_attrs: %d\n", cnid); + hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid); if (!HFSPLUS_SB(dir->i_sb)->attr_tree) { - printk(KERN_ERR "hfs: attributes file doesn't exist\n"); + pr_err("attributes file doesn't exist\n"); return -EINVAL; } @@ -384,7 +384,7 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) err = hfsplus_find_attr(dir->i_sb, cnid, NULL, &fd); if (err) { if (err != -ENOENT) - printk(KERN_ERR "hfs: xattr search failed.\n"); + pr_err("xattr search failed\n"); goto end_delete_all; } diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index d73c98d1ee99..c1422d91cd36 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -22,7 +22,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", + hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); switch (tree->cnid) { case HFSPLUS_CAT_CNID: @@ -44,7 +44,7 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", + hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; @@ -56,7 +56,8 @@ int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, int *end, int *cur_rec) { - __be32 
cur_cnid, search_cnid; + __be32 cur_cnid; + __be32 search_cnid; if (bnode->tree->cnid == HFSPLUS_EXT_CNID) { cur_cnid = fd->key->ext.cnid; @@ -67,8 +68,11 @@ int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, } else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) { cur_cnid = fd->key->attr.cnid; search_cnid = fd->search_key->attr.cnid; - } else + } else { + cur_cnid = 0; /* used-uninitialized warning */ + search_cnid = 0; BUG(); + } if (cur_cnid == search_cnid) { (*end) = (*cur_rec); @@ -204,7 +208,7 @@ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) return res; invalid: - printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", + pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", height, bnode->height, bnode->type, nidx, parent); res = -EIO; release: diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index 6feefc0cb48a..d2954451519e 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -30,7 +30,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, if (!len) return size; - dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); + hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); mutex_lock(&sbi->alloc_mutex); mapping = sbi->alloc_file->i_mapping; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -89,14 +89,14 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, else end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32; } - dprint(DBG_BITMAP, "bitmap full\n"); + hfs_dbg(BITMAP, "bitmap full\n"); start = size; goto out; found: start = offset + (curr - pptr) * 32 + i; if (start >= size) { - dprint(DBG_BITMAP, "bitmap full\n"); + hfs_dbg(BITMAP, "bitmap full\n"); goto out; } /* do any partial u32 at the start */ @@ -154,7 +154,7 @@ done: *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; hfsplus_mark_mdb_dirty(sb); - dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); + hfs_dbg(BITMAP, "-> %u,%u\n", start, *max); out: 
mutex_unlock(&sbi->alloc_mutex); return start; @@ -173,7 +173,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) return 0; - dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); + hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count); /* are all of the bits in range? */ if ((offset + count) > sbi->total_blocks) return -ENOENT; @@ -238,8 +238,7 @@ out: return 0; kaboom: - printk(KERN_CRIT "hfsplus: unable to mark blocks free: error %ld\n", - PTR_ERR(page)); + pr_crit("unable to mark blocks free: error %ld\n", PTR_ERR(page)); mutex_unlock(&sbi->alloc_mutex); return -EIO; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index f31ac6f404f1..11c860204520 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -130,7 +130,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct page **src_page, **dst_page; int l; - dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; tree = src_node->tree; @@ -188,7 +188,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) struct page **src_page, **dst_page; int l; - dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); if (!len) return; src += node->page_offset; @@ -302,16 +302,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - dprint(DBG_BNODE_MOD, " %d", key_off); + hfs_dbg(BNODE_MOD, " %d", 
key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -320,17 +320,17 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = hfs_bnode_read_u16(node, key_off) + 2; else tmp = node->tree->max_key_len + 2; - dprint(DBG_BNODE_MOD, " (%d", tmp); + hfs_dbg_cont(BNODE_MOD, " (%d", tmp); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u16(node, key_off); - dprint(DBG_BNODE_MOD, " (%d)", tmp); + hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); } } - dprint(DBG_BNODE_MOD, "\n"); + hfs_dbg_cont(BNODE_MOD, "\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -366,7 +366,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node) /* move down? */ if (!node->prev && !node->next) - dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n"); + hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n"); if (!node->parent) { tree->root = 0; tree->depth = 0; @@ -386,7 +386,7 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) struct hfs_bnode *node; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node " + pr_err("request for non-existent node " "%d in B*Tree\n", cnid); return NULL; @@ -409,7 +409,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) loff_t off; if (cnid >= tree->node_count) { - printk(KERN_ERR "hfs: request for non-existent node " + pr_err("request for non-existent node " "%d in B*Tree\n", cnid); return NULL; @@ -425,8 +425,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n", - node->tree->cnid, node->this); + hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); node2 = 
hfs_bnode_findhash(tree, cnid); @@ -470,7 +470,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -588,7 +588,7 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) node = hfs_bnode_findhash(tree, num); spin_unlock(&tree->hash_lock); if (node) { - printk(KERN_CRIT "new node %u already hashed?\n", num); + pr_crit("new node %u already hashed?\n", num); WARN_ON(1); return node; } @@ -620,7 +620,7 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n", + hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } @@ -633,7 +633,7 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", + hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 298d4e45604b..6e560d56094b 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -45,13 +45,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) if (!recoff) return 0; if (recoff > node->tree->node_size - 2) { - printk(KERN_ERR "hfs: recoff %d too large\n", recoff); + pr_err("recoff %d too large\n", recoff); return 0; } retval = hfs_bnode_read_u16(node, recoff) + 2; if (retval > node->tree->max_key_len + 2) { - printk(KERN_ERR "hfs: keylen %d too large\n", + pr_err("keylen %d too large\n", retval); retval = 0; } @@ -90,7 +90,7 @@ again: end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - 
dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) @@ -191,7 +191,7 @@ again: mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", + hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); @@ -244,7 +244,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -379,7 +379,7 @@ again: newkeylen = hfs_bnode_read_u16(node, 14) + 2; else fd->keylength = newkeylen = tree->max_key_len + 2; - dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", + hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; @@ -391,7 +391,7 @@ again: end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { - dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n"); + hfs_dbg(BNODE_MOD, "splitting index node\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index efb689c21a95..0c6540c91167 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -40,8 +40,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) tree->inode = inode; if (!HFSPLUS_I(tree->inode)->first_blocks) { - printk(KERN_ERR - "hfs: invalid btree extent records (0 size).\n"); + pr_err("invalid btree extent records (0 size)\n"); goto free_inode; } @@ -68,12 +67,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) switch (id) { case HFSPLUS_EXT_CNID: if (tree->max_key_len != 
HFSPLUS_EXT_KEYLEN - sizeof(u16)) { - printk(KERN_ERR "hfs: invalid extent max_key_len %d\n", + pr_err("invalid extent max_key_len %d\n", tree->max_key_len); goto fail_page; } if (tree->attributes & HFS_TREE_VARIDXKEYS) { - printk(KERN_ERR "hfs: invalid extent btree flag\n"); + pr_err("invalid extent btree flag\n"); goto fail_page; } @@ -81,12 +80,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) break; case HFSPLUS_CAT_CNID: if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) { - printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n", + pr_err("invalid catalog max_key_len %d\n", tree->max_key_len); goto fail_page; } if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { - printk(KERN_ERR "hfs: invalid catalog btree flag\n"); + pr_err("invalid catalog btree flag\n"); goto fail_page; } @@ -100,19 +99,19 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) break; case HFSPLUS_ATTR_CNID: if (tree->max_key_len != HFSPLUS_ATTR_KEYLEN - sizeof(u16)) { - printk(KERN_ERR "hfs: invalid attributes max_key_len %d\n", + pr_err("invalid attributes max_key_len %d\n", tree->max_key_len); goto fail_page; } tree->keycmp = hfsplus_attr_bin_cmp_key; break; default: - printk(KERN_ERR "hfs: unknown B*Tree requested\n"); + pr_err("unknown B*Tree requested\n"); goto fail_page; } if (!(tree->attributes & HFS_TREE_BIGKEYS)) { - printk(KERN_ERR "hfs: invalid btree flag\n"); + pr_err("invalid btree flag\n"); goto fail_page; } @@ -155,7 +154,7 @@ void hfs_btree_close(struct hfs_btree *tree) while ((node = tree->node_hash[i])) { tree->node_hash[i] = node->next_hash; if (atomic_read(&node->refcnt)) - printk(KERN_CRIT "hfs: node %d:%d " + pr_crit("node %d:%d " "still has %d user(s)!\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); @@ -303,7 +302,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) kunmap(*pagep); nidx = node->next; if (!nidx) { - dprint(DBG_BNODE_MOD, "hfs: create new bmap node.\n"); + hfs_dbg(BNODE_MOD, "create new 
bmap node\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); @@ -329,7 +328,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); BUG_ON(!node->this); tree = node->tree; nidx = node->this; @@ -345,7 +344,7 @@ void hfs_bmap_free(struct hfs_bnode *node) hfs_bnode_put(node); if (!i) { /* panic */; - printk(KERN_CRIT "hfs: unable to free bnode %u. " + pr_crit("unable to free bnode %u. " "bmap not found!\n", node->this); return; @@ -355,7 +354,7 @@ void hfs_bmap_free(struct hfs_bnode *node) return; if (node->type != HFS_NODE_MAP) { /* panic */; - printk(KERN_CRIT "hfs: invalid bmap found! " + pr_crit("invalid bmap found! " "(%u,%d)\n", node->this, node->type); hfs_bnode_put(node); @@ -370,7 +369,7 @@ void hfs_bmap_free(struct hfs_bnode *node) m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { - printk(KERN_CRIT "hfs: trying to free free bnode " + pr_crit("trying to free free bnode " "%u(%d)\n", node->this, node->type); kunmap(page); diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 840d71edd193..968ce411db53 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -188,12 +188,12 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, type = be16_to_cpu(tmp.type); if (type != HFSPLUS_FOLDER_THREAD && type != HFSPLUS_FILE_THREAD) { - printk(KERN_ERR "hfs: found bad thread record in catalog\n"); + pr_err("found bad thread record in catalog\n"); return -EIO; } if (be16_to_cpu(tmp.thread.nodeName.length) > 255) { - printk(KERN_ERR "hfs: catalog name length corrupted\n"); + pr_err("catalog name length corrupted\n"); return -EIO; } @@ -212,7 +212,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, int entry_size; int err; - dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", + hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); err = 
hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) @@ -271,8 +271,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) int err, off; u16 type; - dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", - str ? str->name : NULL, cnid); + hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return err; @@ -361,7 +360,7 @@ int hfsplus_rename_cat(u32 cnid, int entry_size, type; int err; - dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 031c24e50521..a37ac934732f 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -103,7 +103,7 @@ again: } else if (!dentry->d_fsdata) dentry->d_fsdata = (void *)(unsigned long)cnid; } else { - printk(KERN_ERR "hfs: invalid catalog entry type in lookup\n"); + pr_err("invalid catalog entry type in lookup\n"); err = -EIO; goto fail; } @@ -159,12 +159,12 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) { - printk(KERN_ERR "hfs: bad catalog folder thread\n"); + pr_err("bad catalog folder thread\n"); err = -EIO; goto out; } if (fd.entrylength < HFSPLUS_MIN_THREAD_SZ) { - printk(KERN_ERR "hfs: truncated catalog thread\n"); + pr_err("truncated catalog thread\n"); err = -EIO; goto out; } @@ -183,7 +183,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) for (;;) { if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) { - printk(KERN_ERR "hfs: walked past end of dir\n"); + pr_err("walked past end of dir\n"); err = -EIO; goto out; } @@ -203,7 +203,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) 
if (type == HFSPLUS_FOLDER) { if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { - printk(KERN_ERR "hfs: small dir entry\n"); + pr_err("small dir entry\n"); err = -EIO; goto out; } @@ -216,7 +216,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) break; } else if (type == HFSPLUS_FILE) { if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { - printk(KERN_ERR "hfs: small file entry\n"); + pr_err("small file entry\n"); err = -EIO; goto out; } @@ -224,7 +224,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) be32_to_cpu(entry.file.id), DT_REG)) break; } else { - printk(KERN_ERR "hfs: bad catalog entry type\n"); + pr_err("bad catalog entry type\n"); err = -EIO; goto out; } diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index a94f0f779d5e..fbb212fbb1ef 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -83,7 +83,7 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext) return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count); } -static void __hfsplus_ext_write_extent(struct inode *inode, +static int __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) { struct hfsplus_inode_info *hip = HFSPLUS_I(inode); @@ -98,13 +98,13 @@ static void __hfsplus_ext_write_extent(struct inode *inode, res = hfs_brec_find(fd, hfs_find_rec_by_key); if (hip->extent_state & HFSPLUS_EXT_NEW) { if (res != -ENOENT) - return; + return res; hfs_brec_insert(fd, hip->cached_extents, sizeof(hfsplus_extent_rec)); hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); } else { if (res) - return; + return res; hfs_bnode_write(fd->bnode, hip->cached_extents, fd->entryoffset, fd->entrylength); hip->extent_state &= ~HFSPLUS_EXT_DIRTY; @@ -117,11 +117,13 @@ static void __hfsplus_ext_write_extent(struct inode *inode, * to explicily mark the inode dirty, too. 
*/ set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags); + + return 0; } static int hfsplus_ext_write_extent_locked(struct inode *inode) { - int res; + int res = 0; if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) { struct hfs_find_data fd; @@ -129,10 +131,10 @@ static int hfsplus_ext_write_extent_locked(struct inode *inode) res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd); if (res) return res; - __hfsplus_ext_write_extent(inode, &fd); + res = __hfsplus_ext_write_extent(inode, &fd); hfs_find_exit(&fd); } - return 0; + return res; } int hfsplus_ext_write_extent(struct inode *inode) @@ -175,8 +177,11 @@ static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, WARN_ON(!mutex_is_locked(&hip->extents_lock)); - if (hip->extent_state & HFSPLUS_EXT_DIRTY) - __hfsplus_ext_write_extent(inode, fd); + if (hip->extent_state & HFSPLUS_EXT_DIRTY) { + res = __hfsplus_ext_write_extent(inode, fd); + if (res) + return res; + } res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino, block, HFSPLUS_IS_RSRC(inode) ? 
@@ -265,7 +270,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, mutex_unlock(&hip->extents_lock); done: - dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", + hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); mask = (1 << sbi->fs_shift) - 1; @@ -288,11 +293,12 @@ static void hfsplus_dump_extent(struct hfsplus_extent *extent) { int i; - dprint(DBG_EXTENT, " "); + hfs_dbg(EXTENT, " "); for (i = 0; i < 8; i++) - dprint(DBG_EXTENT, " %u:%u", be32_to_cpu(extent[i].start_block), - be32_to_cpu(extent[i].block_count)); - dprint(DBG_EXTENT, "\n"); + hfs_dbg_cont(EXTENT, " %u:%u", + be32_to_cpu(extent[i].start_block), + be32_to_cpu(extent[i].block_count)); + hfs_dbg_cont(EXTENT, "\n"); } static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset, @@ -348,8 +354,8 @@ found: if (count <= block_nr) { err = hfsplus_block_free(sb, start, count); if (err) { - printk(KERN_ERR "hfs: can't free extent\n"); - dprint(DBG_EXTENT, " start: %u count: %u\n", + pr_err("can't free extent\n"); + hfs_dbg(EXTENT, " start: %u count: %u\n", start, count); } extent->block_count = 0; @@ -359,8 +365,8 @@ found: count -= block_nr; err = hfsplus_block_free(sb, start + count, block_nr); if (err) { - printk(KERN_ERR "hfs: can't free extent\n"); - dprint(DBG_EXTENT, " start: %u count: %u\n", + pr_err("can't free extent\n"); + hfs_dbg(EXTENT, " start: %u count: %u\n", start, count); } extent->block_count = cpu_to_be32(count); @@ -432,7 +438,7 @@ int hfsplus_file_extend(struct inode *inode) if (sbi->alloc_file->i_size * 8 < sbi->total_blocks - sbi->free_blocks + 8) { /* extend alloc file */ - printk(KERN_ERR "hfs: extend alloc file! " + pr_err("extend alloc file! 
" "(%llu,%u,%u)\n", sbi->alloc_file->i_size * 8, sbi->total_blocks, sbi->free_blocks); @@ -459,11 +465,11 @@ int hfsplus_file_extend(struct inode *inode) } } - dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); if (hip->alloc_blocks <= hip->first_blocks) { if (!hip->first_blocks) { - dprint(DBG_EXTENT, "first extents\n"); + hfs_dbg(EXTENT, "first extents\n"); /* no extents yet */ hip->first_extents[0].start_block = cpu_to_be32(start); hip->first_extents[0].block_count = cpu_to_be32(len); @@ -500,7 +506,7 @@ out: return res; insert_extent: - dprint(DBG_EXTENT, "insert new extent\n"); + hfs_dbg(EXTENT, "insert new extent\n"); res = hfsplus_ext_write_extent_locked(inode); if (res) goto out; @@ -525,15 +531,14 @@ void hfsplus_file_truncate(struct inode *inode) u32 alloc_cnt, blk_cnt, start; int res; - dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n", - inode->i_ino, (long long)hip->phys_size, - inode->i_size); + hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n", + inode->i_ino, (long long)hip->phys_size, inode->i_size); if (inode->i_size > hip->phys_size) { struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; - u32 size = inode->i_size; + loff_t size = inode->i_size; res = pagecache_write_begin(NULL, mapping, size, 0, AOP_FLAG_UNINTERRUPTIBLE, diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 05b11f36024c..60b0a3388b26 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -10,6 +10,12 @@ #ifndef _LINUX_HFSPLUS_FS_H #define _LINUX_HFSPLUS_FS_H +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <linux/mutex.h> #include <linux/buffer_head.h> @@ -32,9 +38,17 @@ #endif #define DBG_MASK (0) -#define dprint(flg, fmt, args...) \ - if (flg & DBG_MASK) \ - printk(fmt , ## args) +#define hfs_dbg(flg, fmt, ...) 
\ +do { \ + if (DBG_##flg & DBG_MASK) \ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ +} while (0) + +#define hfs_dbg_cont(flg, fmt, ...) \ +do { \ + if (DBG_##flg & DBG_MASK) \ + pr_cont(fmt, ##__VA_ARGS__); \ +} while (0) /* Runtime config options */ #define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */ diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 160ccc9cdb4b..f833d35630ab 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> +#include <linux/aio.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" @@ -357,7 +358,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, if (!error) error = error2; } else { - printk(KERN_ERR "hfs: sync non-existent attributes tree\n"); + pr_err("sync non-existent attributes tree\n"); } } @@ -573,7 +574,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); HFSPLUS_I(inode)->create_date = file->create_date; } else { - printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); + pr_err("bad catalog entry used to create inode\n"); res = -EIO; } return res; diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index ed257c671615..968eab5bc1f5 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -113,67 +113,67 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) switch (token) { case opt_creator: if (match_fourchar(&args[0], &sbi->creator)) { - printk(KERN_ERR "hfs: creator requires a 4 character value\n"); + pr_err("creator requires a 4 character value\n"); return 0; } break; case opt_type: if (match_fourchar(&args[0], &sbi->type)) { - printk(KERN_ERR "hfs: type requires a 4 character value\n"); + pr_err("type requires a 4 character value\n"); return 0; } break; case opt_umask: if (match_octal(&args[0], &tmp)) { - printk(KERN_ERR "hfs: umask requires a value\n"); + pr_err("umask requires a 
value\n"); return 0; } sbi->umask = (umode_t)tmp; break; case opt_uid: if (match_int(&args[0], &tmp)) { - printk(KERN_ERR "hfs: uid requires an argument\n"); + pr_err("uid requires an argument\n"); return 0; } sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp); if (!uid_valid(sbi->uid)) { - printk(KERN_ERR "hfs: invalid uid specified\n"); + pr_err("invalid uid specified\n"); return 0; } break; case opt_gid: if (match_int(&args[0], &tmp)) { - printk(KERN_ERR "hfs: gid requires an argument\n"); + pr_err("gid requires an argument\n"); return 0; } sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp); if (!gid_valid(sbi->gid)) { - printk(KERN_ERR "hfs: invalid gid specified\n"); + pr_err("invalid gid specified\n"); return 0; } break; case opt_part: if (match_int(&args[0], &sbi->part)) { - printk(KERN_ERR "hfs: part requires an argument\n"); + pr_err("part requires an argument\n"); return 0; } break; case opt_session: if (match_int(&args[0], &sbi->session)) { - printk(KERN_ERR "hfs: session requires an argument\n"); + pr_err("session requires an argument\n"); return 0; } break; case opt_nls: if (sbi->nls) { - printk(KERN_ERR "hfs: unable to change nls mapping\n"); + pr_err("unable to change nls mapping\n"); return 0; } p = match_strdup(&args[0]); if (p) sbi->nls = load_nls(p); if (!sbi->nls) { - printk(KERN_ERR "hfs: unable to load " + pr_err("unable to load " "nls mapping \"%s\"\n", p); kfree(p); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 974c26f96fae..4c4d142cf890 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -132,7 +132,7 @@ static int hfsplus_system_write_inode(struct inode *inode) if (tree) { int err = hfs_btree_write(tree); if (err) { - printk(KERN_ERR "hfs: b-tree write err: %d, ino %lu\n", + pr_err("b-tree write err: %d, ino %lu\n", err, inode->i_ino); return err; } @@ -145,7 +145,7 @@ static int hfsplus_write_inode(struct inode *inode, { int err; - dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); + hfs_dbg(INODE, 
"hfsplus_write_inode: %lu\n", inode->i_ino); err = hfsplus_ext_write_extent(inode); if (err) @@ -160,7 +160,7 @@ static int hfsplus_write_inode(struct inode *inode, static void hfsplus_evict_inode(struct inode *inode) { - dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); + hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { @@ -179,7 +179,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) if (!wait) return 0; - dprint(DBG_SUPER, "hfsplus_sync_fs\n"); + hfs_dbg(SUPER, "hfsplus_sync_fs\n"); /* * Explicitly write out the special metadata inodes. @@ -251,7 +251,7 @@ static void delayed_sync_fs(struct work_struct *work) err = hfsplus_sync_fs(sbi->alloc_file->i_sb, 1); if (err) - printk(KERN_ERR "hfs: delayed sync fs err %d\n", err); + pr_err("delayed sync fs err %d\n", err); } void hfsplus_mark_mdb_dirty(struct super_block *sb) @@ -275,7 +275,7 @@ static void hfsplus_put_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); - dprint(DBG_SUPER, "hfsplus_put_super\n"); + hfs_dbg(SUPER, "hfsplus_put_super\n"); cancel_delayed_work_sync(&sbi->sync_work); @@ -333,25 +333,19 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { - printk(KERN_WARNING "hfs: filesystem was " - "not cleanly unmounted, " - "running fsck.hfsplus is recommended. " - "leaving read-only.\n"); + pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. 
leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } else if (force) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { - printk(KERN_WARNING "hfs: filesystem is marked locked, " - "leaving read-only.\n"); + pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { - printk(KERN_WARNING "hfs: filesystem is " - "marked journaled, " - "leaving read-only.\n"); + pr_warn("filesystem is marked journaled, leaving read-only.\n"); sb->s_flags |= MS_RDONLY; *flags |= MS_RDONLY; } @@ -397,7 +391,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; if (!hfsplus_parse_options(data, sbi)) { - printk(KERN_ERR "hfs: unable to parse mount options\n"); + pr_err("unable to parse mount options\n"); goto out_unload_nls; } @@ -405,14 +399,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) nls = sbi->nls; sbi->nls = load_nls("utf8"); if (!sbi->nls) { - printk(KERN_ERR "hfs: unable to load nls for utf8\n"); + pr_err("unable to load nls for utf8\n"); goto out_unload_nls; } /* Grab the volume header */ if (hfsplus_read_wrapper(sb)) { if (!silent) - printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); + pr_warn("unable to find HFS+ superblock\n"); goto out_unload_nls; } vhdr = sbi->s_vhdr; @@ -421,7 +415,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = HFSPLUS_VOLHEAD_SIG; if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { - printk(KERN_ERR "hfs: wrong filesystem version\n"); + pr_err("wrong filesystem version\n"); goto out_free_vhdr; } sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); @@ -445,7 +439,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if ((last_fs_block > (sector_t)(~0ULL) >> 
(sbi->alloc_blksz_shift - 9)) || (last_fs_page > (pgoff_t)(~0ULL))) { - printk(KERN_ERR "hfs: filesystem size too large.\n"); + pr_err("filesystem size too large\n"); goto out_free_vhdr; } @@ -454,22 +448,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { - printk(KERN_WARNING "hfs: Filesystem was " - "not cleanly unmounted, " - "running fsck.hfsplus is recommended. " - "mounting read-only.\n"); + pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { - printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); + pr_warn("Filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_WARNING "hfs: write access to " - "a journaled filesystem is not supported, " - "use the force option at your own risk, " - "mounting read-only.\n"); + pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } @@ -478,18 +466,18 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) /* Load metadata objects (B*Trees) */ sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); if (!sbi->ext_tree) { - printk(KERN_ERR "hfs: failed to load extents file\n"); + pr_err("failed to load extents file\n"); goto out_free_vhdr; } sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); if (!sbi->cat_tree) { - printk(KERN_ERR "hfs: failed to load catalog file\n"); + pr_err("failed to load catalog file\n"); goto out_close_ext_tree; } if (vhdr->attr_file.total_blocks != 0) { 
sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID); if (!sbi->attr_tree) { - printk(KERN_ERR "hfs: failed to load attributes file\n"); + pr_err("failed to load attributes file\n"); goto out_close_cat_tree; } } @@ -497,7 +485,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); if (IS_ERR(inode)) { - printk(KERN_ERR "hfs: failed to load allocation file\n"); + pr_err("failed to load allocation file\n"); err = PTR_ERR(inode); goto out_close_attr_tree; } @@ -506,7 +494,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) /* Load the root directory */ root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); if (IS_ERR(root)) { - printk(KERN_ERR "hfs: failed to load root directory\n"); + pr_err("failed to load root directory\n"); err = PTR_ERR(root); goto out_put_alloc_file; } @@ -654,6 +642,7 @@ static struct file_system_type hfsplus_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("hfsplus"); static void hfsplus_init_once(void *p) { diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 90effcccca9a..b51a6079108d 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -156,7 +156,7 @@ static int hfsplus_get_last_session(struct super_block *sb, *start = (sector_t)te.cdte_addr.lba << 2; return 0; } - printk(KERN_ERR "hfs: invalid session number or type of track\n"); + pr_err("invalid session number or type of track\n"); return -EINVAL; } ms_info.addr_format = CDROM_LBA; @@ -234,8 +234,7 @@ reread: error = -EINVAL; if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) { - printk(KERN_WARNING - "hfs: invalid secondary volume header\n"); + pr_warn("invalid secondary volume header\n"); goto out_free_backup_vhdr; } @@ -259,8 +258,7 @@ reread: blocksize >>= 1; if (sb_set_blocksize(sb, blocksize) != blocksize) { - printk(KERN_ERR "hfs: unable to set blocksize to %u!\n", - blocksize); + pr_err("unable to set blocksize to 
%u!\n", blocksize); goto out_free_backup_vhdr; } diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index e8a4b0815c61..f66346155df5 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -107,19 +107,19 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); if (err) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return err; } err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); if (err) { - printk(KERN_ERR "hfs: catalog searching failed\n"); + pr_err("catalog searching failed\n"); goto end_setxattr; } if (!strcmp_xattr_finder_info(name)) { if (flags & XATTR_CREATE) { - printk(KERN_ERR "hfs: xattr exists yet\n"); + pr_err("xattr exists yet\n"); err = -EOPNOTSUPP; goto end_setxattr; } @@ -165,7 +165,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, if (hfsplus_attr_exists(inode, name)) { if (flags & XATTR_CREATE) { - printk(KERN_ERR "hfs: xattr exists yet\n"); + pr_err("xattr exists yet\n"); err = -EOPNOTSUPP; goto end_setxattr; } @@ -177,7 +177,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, goto end_setxattr; } else { if (flags & XATTR_REPLACE) { - printk(KERN_ERR "hfs: cannot replace xattr\n"); + pr_err("cannot replace xattr\n"); err = -EOPNOTSUPP; goto end_setxattr; } @@ -210,7 +210,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, cat_entry_flags); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); } else { - printk(KERN_ERR "hfs: invalid catalog entry type\n"); + pr_err("invalid catalog entry type\n"); err = -EIO; goto end_setxattr; } @@ -269,7 +269,7 @@ static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry, if (size >= record_len) { res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (res) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return res; } res = 
hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); @@ -340,13 +340,13 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, entry = hfsplus_alloc_attr_entry(); if (!entry) { - printk(KERN_ERR "hfs: can't allocate xattr entry\n"); + pr_err("can't allocate xattr entry\n"); return -ENOMEM; } res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); if (res) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); goto failed_getxattr_init; } @@ -355,7 +355,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, if (res == -ENOENT) res = -ENODATA; else - printk(KERN_ERR "hfs: xattr searching failed\n"); + pr_err("xattr searching failed\n"); goto out; } @@ -368,17 +368,17 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, offsetof(struct hfsplus_attr_inline_data, length)); if (record_length > HFSPLUS_MAX_INLINE_DATA_SIZE) { - printk(KERN_ERR "hfs: invalid xattr record size\n"); + pr_err("invalid xattr record size\n"); res = -EIO; goto out; } } else if (record_type == HFSPLUS_ATTR_FORK_DATA || record_type == HFSPLUS_ATTR_EXTENTS) { - printk(KERN_ERR "hfs: only inline data xattr are supported\n"); + pr_err("only inline data xattr are supported\n"); res = -EOPNOTSUPP; goto out; } else { - printk(KERN_ERR "hfs: invalid xattr record\n"); + pr_err("invalid xattr record\n"); res = -EIO; goto out; } @@ -427,7 +427,7 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (res) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return res; } @@ -506,7 +506,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); if (err) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return err; } @@ -525,8 +525,7 @@ ssize_t 
hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) for (;;) { key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset); if (key_len == 0 || key_len > fd.tree->max_key_len) { - printk(KERN_ERR "hfs: invalid xattr key length: %d\n", - key_len); + pr_err("invalid xattr key length: %d\n", key_len); res = -EIO; goto end_listxattr; } @@ -541,7 +540,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) if (hfsplus_uni2asc(inode->i_sb, (const struct hfsplus_unistr *)&fd.key->attr.key_name, strbuf, &xattr_name_len)) { - printk(KERN_ERR "hfs: unicode conversion failed\n"); + pr_err("unicode conversion failed\n"); res = -EIO; goto end_listxattr; } @@ -598,13 +597,13 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name) err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); if (err) { - printk(KERN_ERR "hfs: can't init xattr find struct\n"); + pr_err("can't init xattr find struct\n"); return err; } err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); if (err) { - printk(KERN_ERR "hfs: catalog searching failed\n"); + pr_err("catalog searching failed\n"); goto end_removexattr; } @@ -643,7 +642,7 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name) flags); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); } else { - printk(KERN_ERR "hfs: invalid catalog entry type\n"); + pr_err("invalid catalog entry type\n"); err = -EIO; goto end_removexattr; } diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index fbabb906066f..32f35f187989 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -7,6 +7,7 @@ */ #include <linux/fs.h> +#include <linux/magic.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/pagemap.h> @@ -45,8 +46,6 @@ static const struct dentry_operations hostfs_dentry_ops = { static char *root_ino = ""; static int append = 0; -#define HOSTFS_SUPER_MAGIC 0x00c0ffee - static const struct inode_operations hostfs_iops; static const struct 
inode_operations hostfs_dir_iops; static const struct inode_operations hostfs_link_iops; @@ -121,7 +120,7 @@ static char *dentry_name(struct dentry *dentry) if (!name) return NULL; - return __dentry_name(dentry, name); /* will unlock */ + return __dentry_name(dentry, name); } static char *inode_name(struct inode *ino) @@ -229,10 +228,11 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) { struct hostfs_inode_info *hi; - hi = kzalloc(sizeof(*hi), GFP_KERNEL); + hi = kmalloc(sizeof(*hi), GFP_KERNEL); if (hi == NULL) return NULL; hi->fd = -1; + hi->mode = 0; inode_init_once(&hi->vfs_inode); return &hi->vfs_inode; } @@ -845,15 +845,8 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - int error; - - error = inode_newsize_ok(inode, attr->ia_size); - if (error) - return error; - + attr->ia_size != i_size_read(inode)) truncate_setsize(inode, attr->ia_size); - } setattr_copy(inode, attr); mark_inode_dirty(inode); @@ -993,6 +986,7 @@ static struct file_system_type hostfs_type = { .kill_sb = hostfs_kill_sb, .fs_flags = 0, }; +MODULE_ALIAS_FS("hostfs"); static int __init init_hostfs(void) { diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 9f9dbeceeee7..3027f4dbbab5 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -131,6 +131,24 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping, return ret; } +static int hpfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pagep, void *fsdata) +{ + struct inode *inode = mapping->host; + int err; + err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + if (err < len) + hpfs_write_failed(mapping, pos + len); + if (!(err < 0)) { + /* make sure we write it on close, if not earlier */ + hpfs_lock(inode->i_sb); + hpfs_i(inode)->i_dirty = 1; + hpfs_unlock(inode->i_sb); + } + return err; +} + static sector_t 
_hpfs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,hpfs_get_block); @@ -140,30 +158,16 @@ const struct address_space_operations hpfs_aops = { .readpage = hpfs_readpage, .writepage = hpfs_writepage, .write_begin = hpfs_write_begin, - .write_end = generic_write_end, + .write_end = hpfs_write_end, .bmap = _hpfs_bmap }; -static ssize_t hpfs_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - ssize_t retval; - - retval = do_sync_write(file, buf, count, ppos); - if (retval > 0) { - hpfs_lock(file->f_path.dentry->d_sb); - hpfs_i(file_inode(file))->i_dirty = 1; - hpfs_unlock(file->f_path.dentry->d_sb); - } - return retval; -} - const struct file_operations hpfs_file_ops = { .llseek = generic_file_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, - .write = hpfs_file_write, + .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, .release = hpfs_file_release, diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a3076228523d..a0617e706957 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -688,6 +688,7 @@ static struct file_system_type hpfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("hpfs"); static int __init init_hpfs_fs(void) { diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 74f55703be49..cd3e38972c86 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -436,7 +436,6 @@ static int hppfs_open(struct inode *inode, struct file *file) path.mnt = inode->i_sb->s_fs_info; path.dentry = HPPFS_I(inode)->proc_dentry; - /* XXX This isn't closed anywhere */ data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred); err = PTR_ERR(data->proc_file); if (IS_ERR(data->proc_file)) @@ -523,12 +522,23 @@ static loff_t hppfs_llseek(struct file *file, loff_t off, int where) return default_llseek(file, off, where); } +static int hppfs_release(struct inode *inode, struct file *file) +{ + struct 
hppfs_private *data = file->private_data; + struct file *proc_file = data->proc_file; + if (proc_file) + fput(proc_file); + kfree(data); + return 0; +} + static const struct file_operations hppfs_file_fops = { .owner = NULL, .llseek = hppfs_llseek, .read = hppfs_read, .write = hppfs_write, .open = hppfs_open, + .release = hppfs_release, }; struct hppfs_dirent { @@ -570,18 +580,12 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) return err; } -static int hppfs_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - return filemap_write_and_wait_range(file->f_mapping, start, end); -} - static const struct file_operations hppfs_dir_fops = { .owner = NULL, .readdir = hppfs_readdir, .open = hppfs_dir_open, - .fsync = hppfs_fsync, .llseek = default_llseek, + .release = hppfs_release, }; static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) @@ -748,6 +752,7 @@ static struct file_system_type hppfs_type = { .kill_sb = kill_anon_super, .fs_flags = 0, }; +MODULE_ALIAS_FS("hppfs"); static int __init init_hppfs(void) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7f94e0cbc69c..a3f868ae3fd4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap_pgoff unwinds (may be important on powerpc * and ia64). 
*/ - vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; vma->vm_ops = &hugetlb_vm_ops; if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) @@ -896,6 +896,7 @@ static struct file_system_type hugetlbfs_fs_type = { .mount = hugetlbfs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("hugetlbfs"); static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -908,11 +909,8 @@ static int can_do_hugetlb_shm(void) static int get_hstate_idx(int page_size_log) { - struct hstate *h; + struct hstate *h = hstate_sizelog(page_size_log); - if (!page_size_log) - return default_hstate_idx; - h = size_to_hstate(1 << page_size_log); if (!h) return -1; return h - hstates; @@ -928,9 +926,12 @@ static struct dentry_operations anon_ops = { .d_dname = hugetlb_dname }; -struct file *hugetlb_file_setup(const char *name, unsigned long addr, - size_t size, vm_flags_t acctflag, - struct user_struct **user, +/* + * Note that size should be aligned to proper hugepage size in caller side, + * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. 
+ */ +struct file *hugetlb_file_setup(const char *name, size_t size, + vm_flags_t acctflag, struct user_struct **user, int creat_flags, int page_size_log) { struct file *file = ERR_PTR(-ENOMEM); @@ -938,8 +939,6 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, struct path path; struct super_block *sb; struct qstr quick_string; - struct hstate *hstate; - unsigned long num_pages; int hstate_idx; hstate_idx = get_hstate_idx(page_size_log); @@ -979,12 +978,10 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, if (!inode) goto out_dentry; - hstate = hstate_inode(inode); - size += addr & ~huge_page_mask(hstate); - num_pages = ALIGN(size, huge_page_size(hstate)) >> - huge_page_shift(hstate); file = ERR_PTR(-ENOMEM); - if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag)) + if (hugetlb_reserve_pages(inode, 0, + size >> huge_page_shift(hstate_inode(inode)), NULL, + acctflag)) goto out_inode; d_instantiate(path.dentry, inode); diff --git a/fs/inode.c b/fs/inode.c index f5f7c06c36fb..00d5fc3b86e1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -725,7 +725,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) * inode to the back of the list so we don't spin on it. 
*/ if (!spin_trylock(&inode->i_lock)) { - list_move_tail(&inode->i_lru, &sb->s_inode_lru); + list_move(&inode->i_lru, &sb->s_inode_lru); continue; } @@ -1803,7 +1803,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; } else if (S_ISFIFO(mode)) - inode->i_fop = &def_fifo_fops; + inode->i_fop = &pipefifo_fops; else if (S_ISSOCK(mode)) inode->i_fop = &bad_sock_fops; else diff --git a/fs/internal.h b/fs/internal.h index 507141fceb99..eaa75f75b625 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -125,3 +125,13 @@ extern int invalidate_inodes(struct super_block *, bool); * dcache.c */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); + +/* + * read_write.c + */ +extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); + +/* + * pipe.c + */ +extern const struct file_operations pipefifo_fops; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 67ce52507d7d..d9b8aebdeb22 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1556,6 +1556,8 @@ static struct file_system_type iso9660_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("iso9660"); +MODULE_ALIAS("iso9660"); static int __init init_iso9660_fs(void) { @@ -1593,5 +1595,3 @@ static void __exit exit_iso9660_fs(void) module_init(init_iso9660_fs) module_exit(exit_iso9660_fs) MODULE_LICENSE("GPL"); -/* Actual filesystem name is iso9660, as requested in filesystems.c */ -MODULE_ALIAS("iso9660"); diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 86b39b167c23..11bb11f48b3a 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -162,8 +162,17 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, for (i = 0; i < bufs; i++) { wbuf[i]->b_end_io = end_buffer_write_sync; - /* We use-up our safety reference in submit_bh() */ - submit_bh(write_op, wbuf[i]); + /* + * Here we write back pagecache data that may be mmaped. 
Since + * we cannot afford to clean the page and set PageWriteback + * here due to lock ordering (page lock ranks above transaction + * start), the data can change while IO is in flight. Tell the + * block layer it should bounce the bio pages if stable data + * during write is required. + * + * We use up our safety reference in submit_bh(). + */ + _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE); } } @@ -667,7 +676,17 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(write_op, bh); + /* + * In data=journal mode, here we can end up + * writing pagecache data that might be + * mmapped. Since we can't afford to clean the + * page and set PageWriteback (see the comment + * near the other use of _submit_bh()), the + * data can change while the write is in + * flight. Tell the block layer to bounce the + * bio pages if stable pages are required. + */ + _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE); } cond_resched(); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 81cc7eaff863..6510d6355729 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -310,8 +310,6 @@ int journal_write_metadata_buffer(transaction_t *transaction, new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); atomic_set(&new_bh->b_count, 1); new_jh = journal_add_journal_head(new_bh); /* This sleeps */ @@ -564,6 +562,16 @@ int log_wait_commit(journal_t *journal, tid_t tid) spin_unlock(&journal->j_state_lock); #endif spin_lock(&journal->j_state_lock); + /* + * Not running or committing trans? Must be already committed. This + * saves us from waiting for a *long* time when tid overflows. 
+ */ + if (!((journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) || + (journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid))) + goto out_unlock; + if (!tid_geq(journal->j_commit_waited, tid)) journal->j_commit_waited = tid; while (tid_gt(tid, journal->j_commit_sequence)) { @@ -575,6 +583,7 @@ int log_wait_commit(journal_t *journal, tid_t tid) !tid_gt(tid, journal->j_commit_sequence)); spin_lock(&journal->j_state_lock); } +out_unlock: spin_unlock(&journal->j_state_lock); if (unlikely(is_journal_aborted(journal))) { @@ -1845,7 +1854,7 @@ static struct journal_head *journal_alloc_journal_head(void) #ifdef CONFIG_JBD_DEBUG atomic_inc(&nr_journal_heads); #endif - ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); if (ret == NULL) { jbd_debug(1, "out of memory for journal_head\n"); printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", @@ -1853,7 +1862,7 @@ static struct journal_head *journal_alloc_journal_head(void) while (ret == NULL) { yield(); - ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); } } return ret; @@ -1915,10 +1924,8 @@ struct journal_head *journal_add_journal_head(struct buffer_head *bh) struct journal_head *new_jh = NULL; repeat: - if (!buffer_jbd(bh)) { + if (!buffer_jbd(bh)) new_jh = journal_alloc_journal_head(); - memset(new_jh, 0, sizeof(*new_jh)); - } jbd_lock_bh_journal_head(bh); if (buffer_jbd(bh)) { diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 071d6905f0dd..e3e255c0a509 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -245,7 +245,6 @@ static handle_t *new_handle(int nblocks) handle_t *handle = jbd_alloc_handle(GFP_NOFS); if (!handle) return NULL; - memset(handle, 0, sizeof(*handle)); handle->h_buffer_credits = nblocks; handle->h_ref = 1; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 750c70148eff..0f53946f13c1 
100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int space_left = 0; int first_tag = 0; int tag_flag; - int i, to_free = 0; + int i; int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; @@ -1134,7 +1134,7 @@ restart_loop: journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; spin_unlock(&journal->j_history_lock); - commit_transaction->t_state = T_FINISHED; + commit_transaction->t_state = T_COMMIT_CALLBACK; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; @@ -1149,38 +1149,44 @@ restart_loop: journal->j_average_commit_time*3) / 4; else journal->j_average_commit_time = commit_time; + write_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { - __jbd2_journal_drop_transaction(journal, commit_transaction); - to_free = 1; + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; } else { - if (journal->j_checkpoint_transactions == NULL) { - journal->j_checkpoint_transactions = commit_transaction; - commit_transaction->t_cpnext = commit_transaction; - commit_transaction->t_cpprev = commit_transaction; - } else { - commit_transaction->t_cpnext = - journal->j_checkpoint_transactions; - commit_transaction->t_cpprev = - commit_transaction->t_cpnext->t_cpprev; - commit_transaction->t_cpnext->t_cpprev = - commit_transaction; - commit_transaction->t_cpprev->t_cpnext = + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + 
commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = commit_transaction; - } } spin_unlock(&journal->j_list_lock); - + /* Drop all spin_locks because commit_callback may be block. + * __journal_remove_checkpoint() can not destroy transaction + * under us because it is not marked as T_FINISHED yet */ if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); - if (to_free) - jbd2_journal_free_transaction(commit_transaction); + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + commit_transaction->t_state = T_FINISHED; + /* Recheck checkpoint lists after j_list_lock was dropped */ + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { + __jbd2_journal_drop_transaction(journal, commit_transaction); + jbd2_journal_free_transaction(commit_transaction); + } + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_done_commit); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ed10991ab006..95457576e434 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -367,8 +367,6 @@ retry_alloc: } /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); atomic_set(&new_bh->b_count, 1); new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ @@ -710,6 +708,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) } /* + * When this function returns the transaction corresponding to tid + * will be completed. If the transaction has currently running, start + * committing that transaction before waiting for it to complete. If + * the transaction id is stale, it is by definition already completed, + * so just return SUCCESS. 
+ */ +int jbd2_complete_transaction(journal_t *journal, tid_t tid) +{ + int need_to_wait = 1; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) { + if (journal->j_commit_request != tid) { + /* transaction not yet started, so request it */ + read_unlock(&journal->j_state_lock); + jbd2_log_start_commit(journal, tid); + goto wait_commit; + } + } else if (!(journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid)) + need_to_wait = 0; + read_unlock(&journal->j_state_lock); + if (!need_to_wait) + return 0; +wait_commit: + return jbd2_log_wait_commit(journal, tid); +} +EXPORT_SYMBOL(jbd2_complete_transaction); + +/* * Log buffer allocation routines: */ @@ -950,7 +979,7 @@ static const struct seq_operations jbd2_seq_info_ops = { static int jbd2_seq_info_open(struct inode *inode, struct file *file) { - journal_t *journal = PDE(inode)->data; + journal_t *journal = PDE_DATA(inode); struct jbd2_stats_proc_session *s; int rc, size; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d6ee5aed56b1..10f524c59ea8 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks) handle_t *handle = jbd2_alloc_handle(GFP_NOFS); if (!handle) return NULL; - memset(handle, 0, sizeof(*handle)); handle->h_buffer_credits = nblocks; handle->h_ref = 1; @@ -640,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, int error; char *frozen_buffer = NULL; int need_copy = 0; + unsigned long start_lock, time_lock; if (is_handle_aborted(handle)) return -EROFS; @@ -655,9 +655,16 @@ repeat: /* @@@ Need to check for errors here at some point. 
*/ + start_lock = jiffies; lock_buffer(bh); jbd_lock_bh_state(bh); + /* If it takes too long to lock the buffer, trace it */ + time_lock = jbd2_time_diff(start_lock, jiffies); + if (time_lock > HZ/10) + trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev, + jiffies_to_msecs(time_lock)); + /* We now hold the buffer lock so it is safe to query the buffer * state. Is the buffer dirty? * @@ -1065,9 +1072,12 @@ out: void jbd2_journal_set_triggers(struct buffer_head *bh, struct jbd2_buffer_trigger_type *type) { - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh = jbd2_journal_grab_journal_head(bh); + if (WARN_ON(!jh)) + return; jh->b_triggers = type; + jbd2_journal_put_journal_head(jh); } void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, @@ -1119,17 +1129,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh; int ret = 0; - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); if (is_handle_aborted(handle)) goto out; - if (!buffer_jbd(bh)) { + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) { ret = -EUCLEAN; goto out; } + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); jbd_lock_bh_state(bh); @@ -1220,6 +1231,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); + jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); WARN_ON(ret); /* All errors are bugs, so dump the stack */ diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index d3d8799e2187..0defb1cc2a35 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -356,6 +356,7 @@ static struct file_system_type jffs2_fs_type = { .mount = jffs2_mount, .kill_sb = jffs2_kill_sb, }; +MODULE_ALIAS_FS("jffs2"); static int __init init_jffs2_fs(void) { diff 
--git a/fs/jfs/inode.c b/fs/jfs/inode.c index b7dc47ba675e..730f24e282a6 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -23,6 +23,7 @@ #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/writeback.h> +#include <linux/aio.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" @@ -125,7 +126,7 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) { int wait = wbc->sync_mode == WB_SYNC_ALL; - if (test_cflag(COMMIT_Nolink, inode)) + if (inode->i_nlink == 0) return 0; /* * If COMMIT_DIRTY is not set, the inode isn't really dirty. diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 6ba4006e011b..f7e042b63ddb 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -1493,7 +1493,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) /* mask any prior bits for the starting words of the * summary map. */ - mask = ONES << (EXTSPERSUM - bitno); + mask = (bitno == 0) ? 0 : (ONES << (EXTSPERSUM - bitno)); inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask; extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 2eb952c41a69..c57499dca89c 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1058,7 +1058,8 @@ static int lmLogSync(struct jfs_log * log, int hard_sync) */ void jfs_syncpt(struct jfs_log *log, int hard_sync) { LOG_LOCK(log); - lmLogSync(log, hard_sync); + if (!test_bit(log_QUIESCE, &log->flag)) + lmLogSync(log, hard_sync); LOG_UNLOCK(log); } @@ -2004,7 +2005,6 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; @@ -2145,7 +2145,6 @@ static void lbmStartIO(struct lbuf * bp) bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 
060ba638becb..2003e830ed1c 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -833,6 +833,7 @@ static struct file_system_type jfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("jfs"); static void init_once(void *foo) { diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 0796c45d0d4d..01bfe7662751 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -144,6 +144,9 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) timeout); if (ret < 0) return -ERESTARTSYS; + /* Reset the lock status after a server reboot so we resend */ + if (block->b_status == nlm_lck_denied_grace_period) + block->b_status = nlm_lck_blocked; req->a_res.status = block->b_status; return 0; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 7e529c3c45c0..9760ecb9b60f 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -550,9 +550,6 @@ again: status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); if (status < 0) break; - /* Resend the blocking lock request after a server reboot */ - if (resp->status == nlm_lck_denied_grace_period) - continue; if (resp->status != nlm_lck_blocked) break; } diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index e784a217b500..550475ca6a0e 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -32,7 +32,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio_vec.bv_len = PAGE_SIZE; bio_vec.bv_offset = 0; bio.bi_vcnt = 1; - bio.bi_idx = 0; bio.bi_size = PAGE_SIZE; bio.bi_bdev = bdev; bio.bi_sector = page->index * (PAGE_SIZE >> 9); @@ -108,7 +107,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_idx = 0; bio->bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; @@ -136,7 +134,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, 
unlock_page(page); } bio->bi_vcnt = nr_pages; - bio->bi_idx = 0; bio->bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; @@ -202,7 +199,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_idx = 0; bio->bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; @@ -224,7 +220,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_io_vec[i].bv_offset = 0; } bio->bi_vcnt = nr_pages; - bio->bi_idx = 0; bio->bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 345c24b8a6f8..54360293bcb5 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -608,6 +608,7 @@ static struct file_system_type logfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("logfs"); static int __init logfs_init(void) { diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 99541cceb584..df122496f328 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -660,6 +660,7 @@ static struct file_system_type minix_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("minix"); static int __init init_minix_fs(void) { diff --git a/fs/mount.h b/fs/mount.h index cd5007980400..64a858143ff9 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -18,6 +18,12 @@ struct mnt_pcp { int mnt_writers; }; +struct mountpoint { + struct list_head m_hash; + struct dentry *m_dentry; + int m_count; +}; + struct mount { struct list_head mnt_hash; struct mount *mnt_parent; @@ -40,6 +46,7 @@ struct mount { struct list_head mnt_slave; /* slave list entry */ struct mount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ + struct mountpoint *mnt_mp; /* where is it mounted */ #ifdef CONFIG_FSNOTIFY struct hlist_head mnt_fsnotify_marks; __u32 
mnt_fsnotify_mask; diff --git a/fs/namei.c b/fs/namei.c index 961bc1268366..57ae9c8c66bf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -689,8 +689,6 @@ void nd_jump_link(struct nameidata *nd, struct path *path) nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->flags |= LOOKUP_JUMPED; - - BUG_ON(nd->inode->i_op->follow_link); } static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) diff --git a/fs/namespace.c b/fs/namespace.c index 50ca17d3cb45..7b1ca9ba0b0a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -21,7 +21,8 @@ #include <linux/fs_struct.h> /* get_fs_root et.al. */ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ #include <linux/uaccess.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> +#include <linux/magic.h> #include "pnode.h" #include "internal.h" @@ -36,6 +37,7 @@ static int mnt_id_start = 0; static int mnt_group_start = 1; static struct list_head *mount_hashtable __read_mostly; +static struct list_head *mountpoint_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; static struct rw_semaphore namespace_sem; @@ -605,6 +607,51 @@ struct vfsmount *lookup_mnt(struct path *path) } } +static struct mountpoint *new_mountpoint(struct dentry *dentry) +{ + struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry); + struct mountpoint *mp; + + list_for_each_entry(mp, chain, m_hash) { + if (mp->m_dentry == dentry) { + /* might be worth a WARN_ON() */ + if (d_unlinked(dentry)) + return ERR_PTR(-ENOENT); + mp->m_count++; + return mp; + } + } + + mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!mp) + return ERR_PTR(-ENOMEM); + + spin_lock(&dentry->d_lock); + if (d_unlinked(dentry)) { + spin_unlock(&dentry->d_lock); + kfree(mp); + return ERR_PTR(-ENOENT); + } + dentry->d_flags |= DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); + mp->m_dentry = dentry; + mp->m_count = 1; + list_add(&mp->m_hash, chain); + return mp; +} + +static void put_mountpoint(struct mountpoint 
*mp) +{ + if (!--mp->m_count) { + struct dentry *dentry = mp->m_dentry; + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); + list_del(&mp->m_hash); + kfree(mp); + } +} + static inline int check_mnt(struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; @@ -633,27 +680,6 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) } /* - * Clear dentry's mounted state if it has no remaining mounts. - * vfsmount_lock must be held for write. - */ -static void dentry_reset_mounted(struct dentry *dentry) -{ - unsigned u; - - for (u = 0; u < HASH_SIZE; u++) { - struct mount *p; - - list_for_each_entry(p, &mount_hashtable[u], mnt_hash) { - if (p->mnt_mountpoint == dentry) - return; - } - } - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_MOUNTED; - spin_unlock(&dentry->d_lock); -} - -/* * vfsmount lock must be held for write */ static void detach_mnt(struct mount *mnt, struct path *old_path) @@ -664,32 +690,35 @@ static void detach_mnt(struct mount *mnt, struct path *old_path) mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); list_del_init(&mnt->mnt_hash); - dentry_reset_mounted(old_path->dentry); + put_mountpoint(mnt->mnt_mp); + mnt->mnt_mp = NULL; } /* * vfsmount lock must be held for write */ -void mnt_set_mountpoint(struct mount *mnt, struct dentry *dentry, +void mnt_set_mountpoint(struct mount *mnt, + struct mountpoint *mp, struct mount *child_mnt) { + mp->m_count++; mnt_add_count(mnt, 1); /* essentially, that's mntget */ - child_mnt->mnt_mountpoint = dget(dentry); + child_mnt->mnt_mountpoint = dget(mp->m_dentry); child_mnt->mnt_parent = mnt; - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_MOUNTED; - spin_unlock(&dentry->d_lock); + child_mnt->mnt_mp = mp; } /* * vfsmount lock must be held for write */ -static void attach_mnt(struct mount *mnt, struct path *path) +static void attach_mnt(struct mount *mnt, + struct mount *parent, + struct mountpoint *mp) { - 
mnt_set_mountpoint(real_mount(path->mnt), path->dentry, mnt); + mnt_set_mountpoint(parent, mp, mnt); list_add_tail(&mnt->mnt_hash, mount_hashtable + - hash(path->mnt, path->dentry)); - list_add_tail(&mnt->mnt_child, &real_mount(path->mnt)->mnt_mounts); + hash(&parent->mnt, mp->m_dentry)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } /* @@ -798,6 +827,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + /* Don't allow unprivileged users to change mount flags */ + if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); @@ -1091,11 +1124,23 @@ int may_umount(struct vfsmount *mnt) EXPORT_SYMBOL(may_umount); -void release_mounts(struct list_head *head) +static LIST_HEAD(unmounted); /* protected by namespace_sem */ + +static void namespace_unlock(void) { struct mount *mnt; - while (!list_empty(head)) { - mnt = list_first_entry(head, struct mount, mnt_hash); + LIST_HEAD(head); + + if (likely(list_empty(&unmounted))) { + up_write(&namespace_sem); + return; + } + + list_splice_init(&unmounted, &head); + up_write(&namespace_sem); + + while (!list_empty(&head)) { + mnt = list_first_entry(&head, struct mount, mnt_hash); list_del_init(&mnt->mnt_hash); if (mnt_has_parent(mnt)) { struct dentry *dentry; @@ -1115,11 +1160,16 @@ void release_mounts(struct list_head *head) } } +static inline void namespace_lock(void) +{ + down_write(&namespace_sem); +} + /* * vfsmount lock must be held for write * namespace_sem must be held for write */ -void umount_tree(struct mount *mnt, int propagate, struct list_head *kill) +void umount_tree(struct mount *mnt, int propagate) { LIST_HEAD(tmp_list); struct mount *p; @@ -1138,20 +1188,20 @@ void umount_tree(struct mount *mnt, int propagate, struct list_head *kill) list_del_init(&p->mnt_child); if (mnt_has_parent(p)) 
{ p->mnt_parent->mnt_ghosts++; - dentry_reset_mounted(p->mnt_mountpoint); + put_mountpoint(p->mnt_mp); + p->mnt_mp = NULL; } change_mnt_propagation(p, MS_PRIVATE); } - list_splice(&tmp_list, kill); + list_splice(&tmp_list, &unmounted); } -static void shrink_submounts(struct mount *mnt, struct list_head *umounts); +static void shrink_submounts(struct mount *mnt); static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; int retval; - LIST_HEAD(umount_list); retval = security_sb_umount(&mnt->mnt, flags); if (retval) @@ -1218,22 +1268,21 @@ static int do_umount(struct mount *mnt, int flags) return retval; } - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); event++; if (!(flags & MNT_DETACH)) - shrink_submounts(mnt, &umount_list); + shrink_submounts(mnt); retval = -EBUSY; if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { if (!list_empty(&mnt->mnt_list)) - umount_tree(mnt, 1, &umount_list); + umount_tree(mnt, 1); retval = 0; } br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); - release_mounts(&umount_list); + namespace_unlock(); return retval; } @@ -1306,13 +1355,13 @@ static bool mnt_ns_loop(struct path *path) * mount namespace loop? 
*/ struct inode *inode = path->dentry->d_inode; - struct proc_inode *ei; + struct proc_ns *ei; struct mnt_namespace *mnt_ns; if (!proc_ns_inode(inode)) return false; - ei = PROC_I(inode); + ei = get_proc_ns(inode); if (ei->ns_ops != &mntns_operations) return false; @@ -1323,8 +1372,7 @@ static bool mnt_ns_loop(struct path *path) struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, int flag) { - struct mount *res, *p, *q, *r; - struct path path; + struct mount *res, *p, *q, *r, *parent; if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) return ERR_PTR(-EINVAL); @@ -1351,25 +1399,22 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, q = q->mnt_parent; } p = s; - path.mnt = &q->mnt; - path.dentry = p->mnt_mountpoint; + parent = q; q = clone_mnt(p, p->mnt.mnt_root, flag); if (IS_ERR(q)) goto out; br_write_lock(&vfsmount_lock); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, &path); + attach_mnt(q, parent, p->mnt_mp); br_write_unlock(&vfsmount_lock); } } return res; out: if (res) { - LIST_HEAD(umount_list); br_write_lock(&vfsmount_lock); - umount_tree(res, 0, &umount_list); + umount_tree(res, 0); br_write_unlock(&vfsmount_lock); - release_mounts(&umount_list); } return q; } @@ -1379,10 +1424,10 @@ out: struct vfsmount *collect_mounts(struct path *path) { struct mount *tree; - down_write(&namespace_sem); + namespace_lock(); tree = copy_tree(real_mount(path->mnt), path->dentry, CL_COPY_ALL | CL_PRIVATE); - up_write(&namespace_sem); + namespace_unlock(); if (IS_ERR(tree)) return NULL; return &tree->mnt; @@ -1390,13 +1435,11 @@ struct vfsmount *collect_mounts(struct path *path) void drop_collected_mounts(struct vfsmount *mnt) { - LIST_HEAD(umount_list); - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); - umount_tree(real_mount(mnt), 0, &umount_list); + umount_tree(real_mount(mnt), 0); br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); - release_mounts(&umount_list); + namespace_unlock(); 
} int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, @@ -1505,11 +1548,11 @@ static int invent_group_ids(struct mount *mnt, bool recurse) * in allocations. */ static int attach_recursive_mnt(struct mount *source_mnt, - struct path *path, struct path *parent_path) + struct mount *dest_mnt, + struct mountpoint *dest_mp, + struct path *parent_path) { LIST_HEAD(tree_list); - struct mount *dest_mnt = real_mount(path->mnt); - struct dentry *dest_dentry = path->dentry; struct mount *child, *p; int err; @@ -1518,7 +1561,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, if (err) goto out; } - err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list); + err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); if (err) goto out_cleanup_ids; @@ -1530,10 +1573,10 @@ static int attach_recursive_mnt(struct mount *source_mnt, } if (parent_path) { detach_mnt(source_mnt, parent_path); - attach_mnt(source_mnt, path); + attach_mnt(source_mnt, dest_mnt, dest_mp); touch_mnt_namespace(source_mnt->mnt_ns); } else { - mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); commit_tree(source_mnt); } @@ -1552,46 +1595,53 @@ static int attach_recursive_mnt(struct mount *source_mnt, return err; } -static int lock_mount(struct path *path) +static struct mountpoint *lock_mount(struct path *path) { struct vfsmount *mnt; + struct dentry *dentry = path->dentry; retry: - mutex_lock(&path->dentry->d_inode->i_mutex); - if (unlikely(cant_mount(path->dentry))) { - mutex_unlock(&path->dentry->d_inode->i_mutex); - return -ENOENT; + mutex_lock(&dentry->d_inode->i_mutex); + if (unlikely(cant_mount(dentry))) { + mutex_unlock(&dentry->d_inode->i_mutex); + return ERR_PTR(-ENOENT); } - down_write(&namespace_sem); + namespace_lock(); mnt = lookup_mnt(path); - if (likely(!mnt)) - return 0; - up_write(&namespace_sem); + if (likely(!mnt)) { + struct mountpoint *mp = new_mountpoint(dentry); + if (IS_ERR(mp)) { + 
namespace_unlock(); + mutex_unlock(&dentry->d_inode->i_mutex); + return mp; + } + return mp; + } + namespace_unlock(); mutex_unlock(&path->dentry->d_inode->i_mutex); path_put(path); path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); + dentry = path->dentry = dget(mnt->mnt_root); goto retry; } -static void unlock_mount(struct path *path) +static void unlock_mount(struct mountpoint *where) { - up_write(&namespace_sem); - mutex_unlock(&path->dentry->d_inode->i_mutex); + struct dentry *dentry = where->m_dentry; + put_mountpoint(where); + namespace_unlock(); + mutex_unlock(&dentry->d_inode->i_mutex); } -static int graft_tree(struct mount *mnt, struct path *path) +static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) return -EINVAL; - if (S_ISDIR(path->dentry->d_inode->i_mode) != + if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) return -ENOTDIR; - if (d_unlinked(path->dentry)) - return -ENOENT; - - return attach_recursive_mnt(mnt, path, NULL); + return attach_recursive_mnt(mnt, p, mp, NULL); } /* @@ -1629,7 +1679,7 @@ static int do_change_type(struct path *path, int flag) if (!type) return -EINVAL; - down_write(&namespace_sem); + namespace_lock(); if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) @@ -1642,7 +1692,7 @@ static int do_change_type(struct path *path, int flag) br_write_unlock(&vfsmount_lock); out_unlock: - up_write(&namespace_sem); + namespace_unlock(); return err; } @@ -1652,9 +1702,9 @@ static int do_change_type(struct path *path, int flag) static int do_loopback(struct path *path, const char *old_name, int recurse) { - LIST_HEAD(umount_list); struct path old_path; - struct mount *mnt = NULL, *old; + struct mount *mnt = NULL, *old, *parent; + struct mountpoint *mp; int err; if (!old_name || !*old_name) return -EINVAL; @@ -1666,17 +1716,19 @@ static int do_loopback(struct path *path, const char *old_name, if 
(mnt_ns_loop(&old_path)) goto out; - err = lock_mount(path); - if (err) + mp = lock_mount(path); + err = PTR_ERR(mp); + if (IS_ERR(mp)) goto out; old = real_mount(old_path.mnt); + parent = real_mount(path->mnt); err = -EINVAL; if (IS_MNT_UNBINDABLE(old)) goto out2; - if (!check_mnt(real_mount(path->mnt)) || !check_mnt(old)) + if (!check_mnt(parent) || !check_mnt(old)) goto out2; if (recurse) @@ -1686,18 +1738,17 @@ static int do_loopback(struct path *path, const char *old_name, if (IS_ERR(mnt)) { err = PTR_ERR(mnt); - goto out; + goto out2; } - err = graft_tree(mnt, path); + err = graft_tree(mnt, parent, mp); if (err) { br_write_lock(&vfsmount_lock); - umount_tree(mnt, 0, &umount_list); + umount_tree(mnt, 0); br_write_unlock(&vfsmount_lock); } out2: - unlock_mount(path); - release_mounts(&umount_list); + unlock_mount(mp); out: path_put(&old_path); return err; @@ -1713,6 +1764,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) if (readonly_request == __mnt_is_readonly(mnt)) return 0; + if (mnt->mnt_flags & MNT_LOCK_READONLY) + return -EPERM; + if (readonly_request) error = mnt_make_readonly(real_mount(mnt)); else @@ -1779,6 +1833,7 @@ static int do_move_mount(struct path *path, const char *old_name) struct path old_path, parent_path; struct mount *p; struct mount *old; + struct mountpoint *mp; int err; if (!old_name || !*old_name) return -EINVAL; @@ -1786,8 +1841,9 @@ static int do_move_mount(struct path *path, const char *old_name) if (err) return err; - err = lock_mount(path); - if (err < 0) + mp = lock_mount(path); + err = PTR_ERR(mp); + if (IS_ERR(mp)) goto out; old = real_mount(old_path.mnt); @@ -1797,9 +1853,6 @@ static int do_move_mount(struct path *path, const char *old_name) if (!check_mnt(p) || !check_mnt(old)) goto out1; - if (d_unlinked(path->dentry)) - goto out1; - err = -EINVAL; if (old_path.dentry != old_path.mnt->mnt_root) goto out1; @@ -1826,7 +1879,7 @@ static int do_move_mount(struct path *path, const char *old_name) if (p == 
old) goto out1; - err = attach_recursive_mnt(old, path, &parent_path); + err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path); if (err) goto out1; @@ -1834,7 +1887,7 @@ static int do_move_mount(struct path *path, const char *old_name) * automatically */ list_del_init(&old->mnt_expire); out1: - unlock_mount(path); + unlock_mount(mp); out: if (!err) path_put(&parent_path); @@ -1870,21 +1923,24 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) */ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) { + struct mountpoint *mp; + struct mount *parent; int err; mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); - err = lock_mount(path); - if (err) - return err; + mp = lock_mount(path); + if (IS_ERR(mp)) + return PTR_ERR(mp); + parent = real_mount(path->mnt); err = -EINVAL; - if (unlikely(!check_mnt(real_mount(path->mnt)))) { + if (unlikely(!check_mnt(parent))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) goto unlock; /* ... 
and for those we'd better have mountpoint still alive */ - if (!real_mount(path->mnt)->mnt_ns) + if (!parent->mnt_ns) goto unlock; } @@ -1899,10 +1955,10 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) goto unlock; newmnt->mnt.mnt_flags = mnt_flags; - err = graft_tree(newmnt, path); + err = graft_tree(newmnt, parent, mp); unlock: - unlock_mount(path); + unlock_mount(mp); return err; } @@ -1975,11 +2031,11 @@ int finish_automount(struct vfsmount *m, struct path *path) fail: /* remove m from any expiration list it may be on */ if (!list_empty(&mnt->mnt_expire)) { - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); list_del_init(&mnt->mnt_expire); br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); + namespace_unlock(); } mntput(m); mntput(m); @@ -1993,13 +2049,13 @@ fail: */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); + namespace_unlock(); } EXPORT_SYMBOL(mnt_set_expiry); @@ -2012,12 +2068,11 @@ void mark_mounts_for_expiry(struct list_head *mounts) { struct mount *mnt, *next; LIST_HEAD(graveyard); - LIST_HEAD(umounts); if (list_empty(mounts)) return; - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); /* extract from the expiration list every vfsmount that matches the @@ -2035,12 +2090,10 @@ void mark_mounts_for_expiry(struct list_head *mounts) while (!list_empty(&graveyard)) { mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); - umount_tree(mnt, 1, &umounts); + umount_tree(mnt, 1); } br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); - - release_mounts(&umounts); + namespace_unlock(); } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); @@ -2097,7 +2150,7 @@ resume: * * vfsmount_lock must be 
held for write */ -static void shrink_submounts(struct mount *mnt, struct list_head *umounts) +static void shrink_submounts(struct mount *mnt) { LIST_HEAD(graveyard); struct mount *m; @@ -2108,7 +2161,7 @@ static void shrink_submounts(struct mount *mnt, struct list_head *umounts) m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); - umount_tree(m, 1, umounts); + umount_tree(m, 1); } } } @@ -2231,12 +2284,11 @@ long do_mount(const char *dev_name, const char *dir_name, retval = security_sb_mount(dev_name, &path, type_page, flags, data_page); + if (!retval && !may_mount()) + retval = -EPERM; if (retval) goto dput_out; - if (!may_mount()) - return -EPERM; - /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) mnt_flags |= MNT_RELATIME; @@ -2335,14 +2387,14 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, if (IS_ERR(new_ns)) return new_ns; - down_write(&namespace_sem); + namespace_lock(); /* First pass: copy the tree topology */ copy_flags = CL_COPY_ALL | CL_EXPIRE; if (user_ns != mnt_ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE; + copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { - up_write(&namespace_sem); + namespace_unlock(); free_mnt_ns(new_ns); return ERR_CAST(new); } @@ -2373,7 +2425,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, p = next_mnt(p, old); q = next_mnt(q, new); } - up_write(&namespace_sem); + namespace_unlock(); if (rootmnt) mntput(rootmnt); @@ -2411,7 +2463,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; new_ns->root = mnt; - list_add(&new_ns->list, &mnt->mnt_list); + list_add(&mnt->mnt_list, &new_ns->list); } else { mntput(m); } @@ -2543,7 +2595,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { struct path new, old, parent_path, root_parent, 
root; - struct mount *new_mnt, *root_mnt; + struct mount *new_mnt, *root_mnt, *old_mnt; + struct mountpoint *old_mp, *root_mp; int error; if (!may_mount()) @@ -2562,14 +2615,16 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out2; get_fs_root(current->fs, &root); - error = lock_mount(&old); - if (error) + old_mp = lock_mount(&old); + error = PTR_ERR(old_mp); + if (IS_ERR(old_mp)) goto out3; error = -EINVAL; new_mnt = real_mount(new.mnt); root_mnt = real_mount(root.mnt); - if (IS_MNT_SHARED(real_mount(old.mnt)) || + old_mnt = real_mount(old.mnt); + if (IS_MNT_SHARED(old_mnt) || IS_MNT_SHARED(new_mnt->mnt_parent) || IS_MNT_SHARED(root_mnt->mnt_parent)) goto out4; @@ -2578,37 +2633,37 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, error = -ENOENT; if (d_unlinked(new.dentry)) goto out4; - if (d_unlinked(old.dentry)) - goto out4; error = -EBUSY; - if (new.mnt == root.mnt || - old.mnt == root.mnt) + if (new_mnt == root_mnt || old_mnt == root_mnt) goto out4; /* loop, on the same file system */ error = -EINVAL; if (root.mnt->mnt_root != root.dentry) goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) goto out4; /* not attached */ + root_mp = root_mnt->mnt_mp; if (new.mnt->mnt_root != new.dentry) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) goto out4; /* not attached */ /* make sure we can reach put_old from new_root */ - if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new)) + if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; + root_mp->m_count++; /* pin it so it won't go away */ br_write_lock(&vfsmount_lock); detach_mnt(new_mnt, &parent_path); detach_mnt(root_mnt, &root_parent); /* mount old root on put_old */ - attach_mnt(root_mnt, &old); + attach_mnt(root_mnt, old_mnt, old_mp); /* mount new_root on / */ - attach_mnt(new_mnt, &root_parent); + attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); touch_mnt_namespace(current->nsproxy->mnt_ns); br_write_unlock(&vfsmount_lock); 
chroot_fs_refs(&root, &new); + put_mountpoint(root_mp); error = 0; out4: - unlock_mount(&old); + unlock_mount(old_mp); if (!error) { path_put(&root_parent); path_put(&parent_path); @@ -2663,14 +2718,17 @@ void __init mnt_init(void) 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); + mountpoint_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); - if (!mount_hashtable) + if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE); for (u = 0; u < HASH_SIZE; u++) INIT_LIST_HEAD(&mount_hashtable[u]); + for (u = 0; u < HASH_SIZE; u++) + INIT_LIST_HEAD(&mountpoint_hashtable[u]); br_lock_init(&vfsmount_lock); @@ -2687,16 +2745,13 @@ void __init mnt_init(void) void put_mnt_ns(struct mnt_namespace *ns) { - LIST_HEAD(umount_list); - if (!atomic_dec_and_test(&ns->count)) return; - down_write(&namespace_sem); + namespace_lock(); br_write_lock(&vfsmount_lock); - umount_tree(ns->root, 0, &umount_list); + umount_tree(ns->root, 0); br_write_unlock(&vfsmount_lock); - up_write(&namespace_sem); - release_mounts(&umount_list); + namespace_unlock(); free_mnt_ns(ns); } @@ -2732,6 +2787,51 @@ bool our_mnt(struct vfsmount *mnt) return check_mnt(real_mount(mnt)); } +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + +void update_mnt_policy(struct user_namespace *userns) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; +
+ down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + switch (mnt->mnt.mnt_sb->s_magic) { + case SYSFS_MAGIC: + userns->may_mount_sysfs = true; + break; + case PROC_SUPER_MAGIC: + userns->may_mount_proc = true; + break; + } + if (userns->may_mount_sysfs && userns->may_mount_proc) + break; + } + up_read(&namespace_sem); +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 7dafd6899a62..26910c8154da 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -1051,6 +1051,7 @@ static struct file_system_type ncp_fs_type = { .kill_sb = kill_anon_super, .fs_flags = FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("ncpfs"); static int __init init_ncp_fs(void) { diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index f4891bde8851..8485978993e8 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -173,7 +173,7 @@ struct bl_msg_hdr { /* blocklayoutdev.c */ ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); void bl_pipe_destroy_msg(struct rpc_pipe_msg *); -int nfs4_blkdev_put(struct block_device *bdev); +void nfs4_blkdev_put(struct block_device *bdev); struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, struct pnfs_device *dev); int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index a86c5bdad9e3..04303b5c9361 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -56,11 +56,11 @@ static int decode_sector_number(__be32 **rp, sector_t *sp) /* * Release the block device */ -int nfs4_blkdev_put(struct block_device *bdev) +void nfs4_blkdev_put(struct block_device *bdev) { dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); - return blkdev_put(bdev, FMODE_READ); + blkdev_put(bdev, FMODE_READ); } ssize_t 
bl_pipe_downcall(struct file *filp, const char __user *src, diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c index 737d839bc17b..8999cfddd866 100644 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -55,7 +55,8 @@ static void dev_remove(struct net *net, dev_t dev) bl_pipe_msg.bl_wq = &nn->bl_wq; memset(msg, 0, sizeof(*msg)); - msg->data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); + msg->len = sizeof(bl_msg) + bl_msg.totallen; + msg->data = kzalloc(msg->len, GFP_NOFS); if (!msg->data) goto out; @@ -66,7 +67,6 @@ static void dev_remove(struct net *net, dev_t dev) memcpy(msg->data, &bl_msg, sizeof(bl_msg)); dataptr = (uint8_t *) msg->data; memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); - msg->len = sizeof(bl_msg) + bl_msg.totallen; add_wait_queue(&nn->bl_wq, &wq); if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { @@ -88,14 +88,8 @@ out: */ static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) { - int rv; - dprintk("%s Releasing\n", __func__); - rv = nfs4_blkdev_put(bdev->bm_mdev); - if (rv) - printk(KERN_ERR "NFS: %s nfs4_blkdev_put returns %d\n", - __func__, rv); - + nfs4_blkdev_put(bdev->bm_mdev); dev_remove(bdev->net, bdev->bm_mdev->bd_dev); } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 5088b57b078a..cff089a412c7 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -125,6 +125,9 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 2960512792c2..a13d26ede254 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -500,7 +500,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy, &args->craa_type_mask)) 
pnfs_recall_all_layouts(cps->clp); if (flags) - nfs_expire_all_delegation_types(cps->clp, flags); + nfs_expire_unused_delegation_types(cps->clp, flags); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 84d8eae203a7..c513b0cc835f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -593,6 +593,8 @@ int nfs_create_rpc_client(struct nfs_client *clp, args.flags |= RPC_CLNT_CREATE_DISCRTRY; if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS; if (!IS_ERR(clp->cl_rpcclient)) return 0; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 6390a4b5fee7..57db3244f4d9 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -64,17 +64,15 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags) return ret; } -static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) +static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) { struct inode *inode = state->inode; struct file_lock *fl; int status = 0; if (inode->i_flock == NULL) - return 0; - - if (inode->i_flock == NULL) goto out; + /* Protect inode->i_flock using the file locks lock */ lock_flocks(); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { @@ -83,7 +81,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ if (nfs_file_open_context(fl->fl_file) != ctx) continue; unlock_flocks(); - status = nfs4_lock_delegation_recall(state, fl); + status = nfs4_lock_delegation_recall(fl, state, stateid); if (status < 0) goto out; lock_flocks(); @@ -120,7 +118,7 @@ again: seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid); if (!err) - err = nfs_delegation_claim_locks(ctx, state); + err = 
nfs_delegation_claim_locks(ctx, state, stateid); if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) err = -EAGAIN; mutex_unlock(&sp->so_delegreturn_mutex); @@ -389,6 +387,24 @@ out: return err; } +static bool nfs_delegation_need_return(struct nfs_delegation *delegation) +{ + bool ret = false; + + if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) + ret = true; + if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) { + struct inode *inode; + + spin_lock(&delegation->lock); + inode = delegation->inode; + if (inode && list_empty(&NFS_I(inode)->open_files)) + ret = true; + spin_unlock(&delegation->lock); + } + return ret; +} + /** * nfs_client_return_marked_delegations - return previously marked delegations * @clp: nfs_client to process @@ -411,8 +427,7 @@ restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(delegation, &server->delegations, super_list) { - if (!test_and_clear_bit(NFS_DELEGATION_RETURN, - &delegation->flags)) + if (!nfs_delegation_need_return(delegation)) continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) @@ -471,6 +486,13 @@ int nfs4_inode_return_delegation(struct inode *inode) return err; } +static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + static void nfs_mark_return_delegation(struct nfs_server *server, struct nfs_delegation *delegation) { @@ -478,6 +500,45 @@ static void nfs_mark_return_delegation(struct nfs_server *server, set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); } +static bool nfs_server_mark_return_all_delegations(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + bool ret = false; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + 
nfs_mark_return_delegation(server, delegation); + ret = true; + } + return ret; +} + +static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_server_mark_return_all_delegations(server); + rcu_read_unlock(); +} + +static void nfs_delegation_run_state_manager(struct nfs_client *clp) +{ + if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) + nfs4_schedule_state_manager(clp); +} + +/** + * nfs_expire_all_delegations + * @clp: client to process + * + */ +void nfs_expire_all_delegations(struct nfs_client *clp) +{ + nfs_client_mark_return_all_delegations(clp); + nfs_delegation_run_state_manager(clp); +} + /** * nfs_super_return_all_delegations - return delegations for one superblock * @sb: sb to process @@ -486,24 +547,22 @@ static void nfs_mark_return_delegation(struct nfs_server *server, void nfs_server_return_all_delegations(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; - struct nfs_delegation *delegation; + bool need_wait; if (clp == NULL) return; rcu_read_lock(); - list_for_each_entry_rcu(delegation, &server->delegations, super_list) { - spin_lock(&delegation->lock); - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - spin_unlock(&delegation->lock); - } + need_wait = nfs_server_mark_return_all_delegations(server); rcu_read_unlock(); - if (nfs_client_return_marked_delegations(clp) != 0) + if (need_wait) { nfs4_schedule_state_manager(clp); + nfs4_wait_clnt_recover(clp); + } } -static void nfs_mark_return_all_delegation_types(struct nfs_server *server, +static void nfs_mark_return_unused_delegation_types(struct nfs_server *server, fmode_t flags) { struct nfs_delegation *delegation; @@ -512,27 +571,21 @@ static void nfs_mark_return_all_delegation_types(struct nfs_server *server, if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) continue; if (delegation->type & flags) 
- nfs_mark_return_delegation(server, delegation); + nfs_mark_return_if_closed_delegation(server, delegation); } } -static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp, +static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *clp, fmode_t flags) { struct nfs_server *server; rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) - nfs_mark_return_all_delegation_types(server, flags); + nfs_mark_return_unused_delegation_types(server, flags); rcu_read_unlock(); } -static void nfs_delegation_run_state_manager(struct nfs_client *clp) -{ - if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) - nfs4_schedule_state_manager(clp); -} - void nfs_remove_bad_delegation(struct inode *inode) { struct nfs_delegation *delegation; @@ -546,27 +599,17 @@ void nfs_remove_bad_delegation(struct inode *inode) EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); /** - * nfs_expire_all_delegation_types + * nfs_expire_unused_delegation_types * @clp: client to process * @flags: delegation types to expire * */ -void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags) +void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags) { - nfs_client_mark_return_all_delegation_types(clp, flags); + nfs_client_mark_return_unused_delegation_types(clp, flags); nfs_delegation_run_state_manager(clp); } -/** - * nfs_expire_all_delegations - * @clp: client to process - * - */ -void nfs_expire_all_delegations(struct nfs_client *clp) -{ - nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); -} - static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) { struct nfs_delegation *delegation; @@ -574,7 +617,7 @@ static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) continue; - nfs_mark_return_delegation(server, 
delegation); + nfs_mark_return_if_closed_delegation(server, delegation); } } diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index d54d4fca6793..9a79c7a99d6d 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -28,6 +28,7 @@ struct nfs_delegation { enum { NFS_DELEGATION_NEED_RECLAIM = 0, NFS_DELEGATION_RETURN, + NFS_DELEGATION_RETURN_IF_CLOSED, NFS_DELEGATION_REFERENCED, NFS_DELEGATION_RETURNING, }; @@ -41,7 +42,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); void nfs_server_return_all_delegations(struct nfs_server *); void nfs_expire_all_delegations(struct nfs_client *clp); -void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags); +void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); int nfs_delegations_present(struct nfs_client *clp); @@ -53,7 +54,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp); /* NFSv4 delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); -int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); +int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f23f455be42b..e093e73178b7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1486,6 +1486,8 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) goto no_open; if 
(d_mountpoint(dentry)) goto no_open; + if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) + goto no_open; inode = dentry->d_inode; parent = dget_parent(dentry); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 29f4a48a0ee6..a87a44f84113 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -744,6 +744,7 @@ static int do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; + struct nfs_lock_context *l_ctx; int status; /* @@ -752,6 +753,14 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) */ nfs_sync_mapping(filp->f_mapping); + l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); + if (!IS_ERR(l_ctx)) { + status = nfs_iocounter_wait(&l_ctx->io_count); + nfs_put_lock_context(l_ctx); + if (status < 0) + return status; + } + /* NOTE: special case * If we're signalled while cleaning up locks on process exit, we * still need to complete the unlock. diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index dc0f98dfa717..c516da5873fd 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -726,9 +726,9 @@ out1: return ret; } -static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data) +static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen) { - return key_instantiate_and_link(key, data, strlen(data) + 1, + return key_instantiate_and_link(key, data, datalen, id_resolver_cache->thread_keyring, authkey); } @@ -738,6 +738,7 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, struct key *key, struct key *authkey) { char id_str[NFS_UINT_MAXLEN]; + size_t len; int ret = -ENOKEY; /* ret = -ENOKEY */ @@ -747,13 +748,15 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, case IDMAP_CONV_NAMETOID: if (strcmp(upcall->im_name, im->im_name) != 0) break; - sprintf(id_str, "%d", im->im_id); - ret = nfs_idmap_instantiate(key, authkey, id_str); + /* Note: here we store the NUL terminator too */ + len = 
sprintf(id_str, "%d", im->im_id) + 1; + ret = nfs_idmap_instantiate(key, authkey, id_str, len); break; case IDMAP_CONV_IDTONAME: if (upcall->im_id != im->im_id) break; - ret = nfs_idmap_instantiate(key, authkey, im->im_name); + len = strlen(im->im_name); + ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); break; default: ret = -EINVAL; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1f941674b089..c1c7a9d78722 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -561,20 +561,22 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) l_ctx->lockowner.l_owner = current->files; l_ctx->lockowner.l_pid = current->tgid; INIT_LIST_HEAD(&l_ctx->list); + nfs_iocounter_init(&l_ctx->io_count); } static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) { - struct nfs_lock_context *pos; + struct nfs_lock_context *head = &ctx->lock_context; + struct nfs_lock_context *pos = head; - list_for_each_entry(pos, &ctx->lock_context.list, list) { + do { if (pos->lockowner.l_owner != current->files) continue; if (pos->lockowner.l_pid != current->tgid) continue; atomic_inc(&pos->count); return pos; - } + } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head); return NULL; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 541c9ebdbc5a..91e59a39fc08 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -229,6 +229,13 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); +int nfs_iocounter_wait(struct nfs_io_counter *c); + +static inline void nfs_iocounter_init(struct nfs_io_counter *c) +{ + c->flags = 0; + atomic_set(&c->io_count, 0); +} /* nfs2xdr.c */ extern struct rpc_procinfo nfs_procedures[]; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 944c9a5c1039..a1dd768d0a35 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -36,6 
+36,7 @@ enum nfs4_client_state { struct nfs4_minor_version_ops { u32 minor_version; + unsigned init_caps; int (*call_sync)(struct rpc_clnt *clnt, struct nfs_server *server, @@ -46,6 +47,8 @@ struct nfs4_minor_version_ops { const nfs4_stateid *); int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); + int (*free_lock_state)(struct nfs_server *, + struct nfs4_lock_state *); const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; const struct nfs4_state_maintenance_ops *state_renewal_ops; @@ -143,12 +146,14 @@ struct nfs4_lock_state { enum { LK_STATE_IN_USE, NFS_DELEGATED_STATE, /* Current stateid is delegation */ + NFS_OPEN_STATE, /* OPEN stateid is set */ NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */ NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */ NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ + NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ }; struct nfs4_state { @@ -231,8 +236,11 @@ extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struc extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *, struct nfs_fh *, struct nfs_fattr *); extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); -extern int nfs4_release_lockowner(struct nfs4_lock_state *); extern const struct xattr_handler *nfs4_xattr_handlers[]; +extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode); #if defined(CONFIG_NFS_V4_1) static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) @@ -347,13 +355,13 @@ extern int nfs4_wait_clnt_recover(struct nfs_client *clp); 
extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); extern void nfs4_schedule_state_manager(struct nfs_client *); extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); -extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); +extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); -extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, +extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, fmode_t, const struct nfs_lockowner *); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); @@ -412,6 +420,11 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei return memcmp(dst, src, sizeof(*dst)) == 0; } +static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) +{ + return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; +} + #else #define nfs4_close_state(a, b) do { } while (0) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index ac4fc9a8fdbc..947b0c908aa9 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -198,8 +198,12 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, /* Check NFS protocol revision and initialize RPC op vector */ clp->rpc_ops = &nfs_v4_clientops; + if (clp->cl_minorversion != 0) + __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); - error = nfs_create_rpc_client(clp, timeparms, authflavour); + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); + if (error == -EINVAL) + error = nfs_create_rpc_client(clp, timeparms, 
RPC_AUTH_NULL); if (error < 0) goto error; @@ -300,7 +304,7 @@ int nfs40_walk_client_list(struct nfs_client *new, struct rpc_cred *cred) { struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); - struct nfs_client *pos, *n, *prev = NULL; + struct nfs_client *pos, *prev = NULL; struct nfs4_setclientid_res clid = { .clientid = new->cl_clientid, .confirm = new->cl_confirm, @@ -308,10 +312,23 @@ int nfs40_walk_client_list(struct nfs_client *new, int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); - list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { + list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { /* If "pos" isn't marked ready, we can't trust the * remaining fields in "pos" */ - if (pos->cl_cons_state < NFS_CS_READY) + if (pos->cl_cons_state > NFS_CS_READY) { + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + if (prev) + nfs_put_client(prev); + prev = pos; + + status = nfs_wait_client_init_complete(pos); + spin_lock(&nn->nfs_client_lock); + if (status < 0) + continue; + } + if (pos->cl_cons_state != NFS_CS_READY) continue; if (pos->rpc_ops != new->rpc_ops) @@ -423,16 +440,16 @@ int nfs41_walk_client_list(struct nfs_client *new, struct rpc_cred *cred) { struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); - struct nfs_client *pos, *n, *prev = NULL; + struct nfs_client *pos, *prev = NULL; int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); - list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { + list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { /* If "pos" isn't marked ready, we can't trust the * remaining fields in "pos", especially the client * ID and serverowner fields. Wait for CREATE_SESSION * to finish. 
*/ - if (pos->cl_cons_state < NFS_CS_READY) { + if (pos->cl_cons_state > NFS_CS_READY) { atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); @@ -440,18 +457,17 @@ int nfs41_walk_client_list(struct nfs_client *new, nfs_put_client(prev); prev = pos; - nfs4_schedule_lease_recovery(pos); status = nfs_wait_client_init_complete(pos); - if (status < 0) { - nfs_put_client(pos); - spin_lock(&nn->nfs_client_lock); - continue; + if (status == 0) { + nfs4_schedule_lease_recovery(pos); + status = nfs4_wait_clnt_recover(pos); } - status = pos->cl_cons_state; spin_lock(&nn->nfs_client_lock); if (status < 0) continue; } + if (pos->cl_cons_state != NFS_CS_READY) + continue; if (pos->rpc_ops != new->rpc_ops) continue; @@ -469,17 +485,18 @@ int nfs41_walk_client_list(struct nfs_client *new, continue; atomic_inc(&pos->cl_count); - spin_unlock(&nn->nfs_client_lock); + *result = pos; + status = 0; dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); - - *result = pos; - return 0; + break; } /* No matching nfs_client found. */ spin_unlock(&nn->nfs_client_lock); dprintk("NFS: <-- %s status = %d\n", __func__, status); + if (prev) + nfs_put_client(prev); return status; } #endif /* CONFIG_NFS_V4_1 */ @@ -717,6 +734,19 @@ static int nfs4_server_common_setup(struct nfs_server *server, if (error < 0) goto out; + /* Set the basic capabilities */ + server->caps |= server->nfs_client->cl_mvops->init_caps; + if (server->flags & NFS_MOUNT_NORDIRPLUS) + server->caps &= ~NFS_CAP_READDIRPLUS; + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. 
+ */ + if (nfs4_disable_idmapping && + server->client->cl_auth->au_flavor == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; + + /* Probe the root fh to retrieve its FSID and filehandle */ error = nfs4_get_rootfh(server, mntfh); if (error < 0) @@ -760,9 +790,6 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; - server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK; - if (!(data->flags & NFS_MOUNT_NORDIRPLUS)) - server->caps |= NFS_CAP_READDIRPLUS; server->options = data->options; /* Get a client record */ @@ -779,13 +806,6 @@ static int nfs4_init_server(struct nfs_server *server, if (error < 0) goto error; - /* - * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower - * authentication. - */ - if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX) - server->caps |= NFS_CAP_UIDGID_NOMAP; - if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); if (data->wsize) @@ -863,7 +883,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, /* Initialise the client representation from the parent server */ nfs_server_copy_userdata(server, parent_server); - server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; /* Get a client representation. 
* Note: NFSv4 always uses TCP, */ diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 49eeb044c109..22d10623f5ee 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -129,7 +129,6 @@ static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo) { if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return; - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags); pnfs_return_layout(inode); } @@ -159,11 +158,14 @@ static int filelayout_async_handle_error(struct rpc_task *task, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_schedule_stateid_recovery(mds_server, state); + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; goto wait_on_recovery; case -NFS4ERR_EXPIRED: - if (state != NULL) - nfs4_schedule_stateid_recovery(mds_server, state); + if (state != NULL) { + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; + } nfs4_schedule_lease_recovery(mds_client); goto wait_on_recovery; /* DS session errors */ @@ -227,6 +229,9 @@ reset: out: task->tk_status = 0; return -EAGAIN; +out_bad_stateid: + task->tk_status = -EIO; + return 0; wait_on_recovery: rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0) @@ -300,6 +305,10 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) { struct nfs_read_data *rdata = data; + if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { + rpc_exit(task, -EIO); + return; + } if (filelayout_reset_to_mds(rdata->header->lseg)) { dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); filelayout_reset_read(rdata); @@ -308,10 +317,13 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) } rdata->read_done_cb = filelayout_read_done_cb; - nfs41_setup_sequence(rdata->ds_clp->cl_session, + if (nfs41_setup_sequence(rdata->ds_clp->cl_session, &rdata->args.seq_args, &rdata->res.seq_res, - task); 
+ task)) + return; + nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, + rdata->args.lock_context, FMODE_READ); } static void filelayout_read_call_done(struct rpc_task *task, void *data) @@ -402,16 +414,23 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) { struct nfs_write_data *wdata = data; + if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { + rpc_exit(task, -EIO); + return; + } if (filelayout_reset_to_mds(wdata->header->lseg)) { dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); filelayout_reset_write(wdata); rpc_exit(task, 0); return; } - nfs41_setup_sequence(wdata->ds_clp->cl_session, + if (nfs41_setup_sequence(wdata->ds_clp->cl_session, &wdata->args.seq_args, &wdata->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, + wdata->args.lock_context, FMODE_WRITE); } static void filelayout_write_call_done(struct rpc_task *task, void *data) diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index b8da95548d3d..235ff952d3c8 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -70,6 +70,8 @@ struct nfs4_pnfs_ds { struct list_head ds_addrs; struct nfs_client *ds_clp; atomic_t ds_count; + unsigned long ds_state; +#define NFS4DS_CONNECTING 0 /* ds is establishing connection */ }; struct nfs4_file_layout_dsaddr { diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 1fe284f01f8b..661a0f611215 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -775,6 +775,22 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) return flseg->fh_array[i]; } +static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) +{ + might_sleep(); + wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, + nfs_wait_bit_killable, TASK_KILLABLE); +} + +static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) +{ + smp_mb__before_clear_bit(); + clear_bit(NFS4DS_CONNECTING, &ds->ds_state); + 
smp_mb__after_clear_bit(); + wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); +} + + struct nfs4_pnfs_ds * nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) { @@ -791,16 +807,22 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) filelayout_mark_devid_invalid(devid); return NULL; } + if (ds->ds_clp) + return ds; - if (!ds->ds_clp) { + if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); int err; err = nfs4_ds_connect(s, ds); if (err) { nfs4_mark_deviceid_unavailable(devid); - return NULL; + ds = NULL; } + nfs4_clear_ds_conn_bit(ds); + } else { + /* Either ds is connected, or ds is NULL */ + nfs4_wait_ds_connect(ds); } return ds; } diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 0dd766079e1c..cdb0b41a4810 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -134,33 +134,38 @@ static size_t nfs_parse_server_name(char *string, size_t len, return ret; } +/** + * nfs_find_best_sec - Find a security mechanism supported locally + * @flavors: List of security tuples returned by SECINFO procedure + * + * Return the pseudoflavor of the first security mechanism in + * "flavors" that is locally supported. Return RPC_AUTH_UNIX if + * no matching flavor is found in the array. The "flavors" array + * is searched in the order returned from the server, per RFC 3530 + * recommendation. 
+ */ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) { - struct gss_api_mech *mech; - struct xdr_netobj oid; - int i; - rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; + rpc_authflavor_t pseudoflavor; + struct nfs4_secinfo4 *secinfo; + unsigned int i; for (i = 0; i < flavors->num_flavors; i++) { - struct nfs4_secinfo_flavor *flavor; - flavor = &flavors->flavors[i]; - - if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) { - pseudoflavor = flavor->flavor; - break; - } else if (flavor->flavor == RPC_AUTH_GSS) { - oid.len = flavor->gss.sec_oid4.len; - oid.data = flavor->gss.sec_oid4.data; - mech = gss_mech_get_by_OID(&oid); - if (!mech) - continue; - pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service); - gss_mech_put(mech); + secinfo = &flavors->flavors[i]; + + switch (secinfo->flavor) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + case RPC_AUTH_GSS: + pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor, + &secinfo->flavor_info); + if (pseudoflavor != RPC_AUTH_MAXFLAVOR) + return pseudoflavor; break; } } - return pseudoflavor; + return RPC_AUTH_UNIX; } static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b2671cb0f901..8fbc10054115 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -107,6 +107,8 @@ static int nfs4_map_errors(int err) return -EPROTONOSUPPORT; case -NFS4ERR_ACCESS: return -EACCES; + case -NFS4ERR_FILE_OPEN: + return -EBUSY; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); @@ -295,19 +297,30 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc } if (state == NULL) break; - nfs4_schedule_stateid_recovery(server, state); + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; goto wait_on_recovery; case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: + if (inode != NULL && 
nfs4_have_delegation(inode, FMODE_READ)) { + nfs_remove_bad_delegation(inode); + exception->retry = 1; + break; + } if (state == NULL) break; - nfs_remove_bad_delegation(state->inode); - nfs4_schedule_stateid_recovery(server, state); + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; goto wait_on_recovery; case -NFS4ERR_EXPIRED: - if (state != NULL) - nfs4_schedule_stateid_recovery(server, state); + if (state != NULL) { + ret = nfs4_schedule_stateid_recovery(server, state); + if (ret < 0) + break; + } case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); @@ -756,10 +769,40 @@ struct nfs4_opendata { struct iattr attrs; unsigned long timestamp; unsigned int rpc_done : 1; + unsigned int is_recover : 1; int rpc_status; int cancelled; }; +static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, + int err, struct nfs4_exception *exception) +{ + if (err != -EINVAL) + return false; + if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1)) + return false; + server->caps &= ~NFS_CAP_ATOMIC_OPEN_V1; + exception->retry = 1; + return true; +} + +static enum open_claim_type4 +nfs4_map_atomic_open_claim(struct nfs_server *server, + enum open_claim_type4 claim) +{ + if (server->caps & NFS_CAP_ATOMIC_OPEN_V1) + return claim; + switch (claim) { + default: + return claim; + case NFS4_OPEN_CLAIM_FH: + return NFS4_OPEN_CLAIM_NULL; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + return NFS4_OPEN_CLAIM_DELEGATE_CUR; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + return NFS4_OPEN_CLAIM_DELEGATE_PREV; + } +} static void nfs4_init_opendata_res(struct nfs4_opendata *p) { @@ -775,6 +818,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct nfs4_state_owner *sp, fmode_t fmode, int flags, const struct iattr *attrs, + enum open_claim_type4 claim, gfp_t gfp_mask) { struct dentry *parent = dget_parent(dentry); @@ -793,7 +837,6 @@ static struct 
nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->dir = parent; p->owner = sp; atomic_inc(&sp->so_count); - p->o_arg.fh = NFS_FH(dir); p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS @@ -811,7 +854,19 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; - p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; + p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); + switch (p->o_arg.claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + p->o_arg.fh = NFS_FH(dir); + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + p->o_arg.fh = NFS_FH(dentry->d_inode); + } if (attrs != NULL && attrs->ia_valid != 0) { __be32 verf[2]; @@ -924,6 +979,7 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid * if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); + set_bit(NFS_OPEN_STATE, &state->flags); switch (fmode) { case FMODE_READ: set_bit(NFS_O_RDONLY_STATE, &state->flags); @@ -1046,9 +1102,12 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) /* Save the delegation */ nfs4_stateid_copy(&stateid, &delegation->stateid); rcu_read_unlock(); - ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); - if (ret != 0) - goto out; + nfs_release_seqid(opendata->o_arg.seqid); + if (!opendata->is_recover) { + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) + goto out; + } ret = -EAGAIN; /* Try to update the stateid using the delegation */ @@ -1193,11 +1252,13 @@ static struct nfs_open_context 
*nfs4_state_find_open_context(struct nfs4_state * return ERR_PTR(-ENOENT); } -static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, struct nfs4_state *state) +static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, + struct nfs4_state *state, enum open_claim_type4 claim) { struct nfs4_opendata *opendata; - opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, NULL, GFP_NOFS); + opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, + NULL, claim, GFP_NOFS); if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; @@ -1233,6 +1294,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * /* memory barrier prior to reading state->n_* */ clear_bit(NFS_DELEGATED_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); smp_rmb(); if (state->n_rdwr != 0) { clear_bit(NFS_O_RDWR_STATE, &state->flags); @@ -1283,11 +1345,10 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state fmode_t delegation_type = 0; int status; - opendata = nfs4_open_recoverdata_alloc(ctx, state); + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_PREVIOUS); if (IS_ERR(opendata)) return PTR_ERR(opendata); - opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS; - opendata->o_arg.fh = NFS_FH(state->inode); rcu_read_lock(); delegation = rcu_dereference(NFS_I(state->inode)->delegation); if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) @@ -1306,6 +1367,8 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state int err; do { err = _nfs4_do_open_reclaim(ctx, state); + if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) + continue; if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -1320,71 +1383,72 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); 
if (IS_ERR(ctx)) - return PTR_ERR(ctx); + return -EAGAIN; ret = nfs4_do_open_reclaim(ctx, state); put_nfs_open_context(ctx); return ret; } -static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err) { - struct nfs4_opendata *opendata; - int ret; - - opendata = nfs4_open_recoverdata_alloc(ctx, state); - if (IS_ERR(opendata)) - return PTR_ERR(opendata); - opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; - nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); - ret = nfs4_open_recover(opendata, state); - nfs4_opendata_put(opendata); - return ret; + switch (err) { + default: + printk(KERN_ERR "NFS: %s: unhandled error " + "%d.\n", __func__, err); + case 0: + case -ENOENT: + case -ESTALE: + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + set_bit(NFS_DELEGATED_STATE, &state->flags); + nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); + return -EAGAIN; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + set_bit(NFS_DELEGATED_STATE, &state->flags); + case -NFS4ERR_EXPIRED: + /* Don't recall a delegation if it was lost */ + nfs4_schedule_lease_recovery(server->nfs_client); + return -EAGAIN; + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OPENMODE: + nfs_inode_find_state_and_recover(state->inode, + stateid); + nfs4_schedule_stateid_recovery(server, state); + return 0; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + set_bit(NFS_DELEGATED_STATE, &state->flags); + ssleep(1); + return -EAGAIN; + case -ENOMEM: + case -NFS4ERR_DENIED: + /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + return 0; + } + return err; } int nfs4_open_delegation_recall(struct nfs_open_context 
*ctx, struct nfs4_state *state, const nfs4_stateid *stateid) { - struct nfs4_exception exception = { }; struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_opendata *opendata; int err; - do { - err = _nfs4_open_delegation_recall(ctx, state, stateid); - switch (err) { - case 0: - case -ENOENT: - case -ESTALE: - goto out; - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - case -NFS4ERR_DEADSESSION: - set_bit(NFS_DELEGATED_STATE, &state->flags); - nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); - err = -EAGAIN; - goto out; - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_STALE_STATEID: - set_bit(NFS_DELEGATED_STATE, &state->flags); - case -NFS4ERR_EXPIRED: - /* Don't recall a delegation if it was lost */ - nfs4_schedule_lease_recovery(server->nfs_client); - err = -EAGAIN; - goto out; - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_BAD_STATEID: - nfs_inode_find_state_and_recover(state->inode, - stateid); - nfs4_schedule_stateid_recovery(server, state); - case -ENOMEM: - err = 0; - goto out; - } - set_bit(NFS_DELEGATED_STATE, &state->flags); - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); -out: - return err; + + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_DELEG_CUR_FH); + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); + err = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return nfs4_handle_delegation_recall_error(server, state, stateid, err); } static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) @@ -1467,6 +1531,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; struct nfs4_state_owner *sp = data->owner; + struct nfs_client *clp = sp->so_server->nfs_client; if 
(nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) goto out_wait; @@ -1482,15 +1547,20 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR && + data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH && can_open_delegated(delegation, data->o_arg.fmode)) goto unlock_no_action; rcu_read_unlock(); } /* Update client id. */ - data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; - if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { - task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + data->o_arg.clientid = clp->cl_clientid; + switch (data->o_arg.claim) { + case NFS4_OPEN_CLAIM_PREVIOUS: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; + case NFS4_OPEN_CLAIM_FH: + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; @@ -1499,6 +1569,16 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) &data->o_res.seq_res, task) != 0) nfs_release_seqid(data->o_arg.seqid); + + /* Set the create mode (note dependency on the session type) */ + data->o_arg.createmode = NFS4_CREATE_UNCHECKED; + if (data->o_arg.open_flags & O_EXCL) { + data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE; + if (nfs4_has_persistent_session(clp)) + data->o_arg.createmode = NFS4_CREATE_GUARDED; + else if (clp->cl_mvops->minor_version > 0) + data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1; + } return; unlock_no_action: rcu_read_unlock(); @@ -1594,8 +1674,11 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) data->rpc_done = 0; data->rpc_status = 0; data->cancelled = 0; - if (isrecover) + data->is_recover = 0; + if (isrecover) { nfs4_set_sequence_privileged(&o_arg->seq_args); + data->is_recover = 1; + } task = 
rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -1720,7 +1803,8 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s struct nfs4_opendata *opendata; int ret; - opendata = nfs4_open_recoverdata_alloc(ctx, state); + opendata = nfs4_open_recoverdata_alloc(ctx, state, + NFS4_OPEN_CLAIM_FH); if (IS_ERR(opendata)) return PTR_ERR(opendata); ret = nfs4_open_recover(opendata, state); @@ -1738,6 +1822,8 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state do { err = _nfs4_open_expired(ctx, state); + if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) + continue; switch (err) { default: goto out; @@ -1758,7 +1844,7 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); if (IS_ERR(ctx)) - return PTR_ERR(ctx); + return -EAGAIN; ret = nfs4_do_open_expired(ctx, state); put_nfs_open_context(ctx); return ret; @@ -1820,6 +1906,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) clear_bit(NFS_O_RDONLY_STATE, &state->flags); clear_bit(NFS_O_WRONLY_STATE, &state->flags); clear_bit(NFS_O_RDWR_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); } return status; } @@ -1880,10 +1967,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (ret != 0) goto out; - if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) nfs4_schedule_stateid_recovery(server, state); - nfs4_wait_clnt_recover(server->nfs_client); - } *res = state; out: return ret; @@ -1905,6 +1990,7 @@ static int _nfs4_do_open(struct inode *dir, struct nfs4_state *state = NULL; struct nfs_server *server = NFS_SERVER(dir); struct nfs4_opendata *opendata; + enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; int status; /* Protect against reboot recovery conflicts */ @@ -1920,7 +2006,10 @@ static int _nfs4_do_open(struct inode *dir, if (dentry->d_inode != 
NULL) nfs4_return_incompatible_delegation(dentry->d_inode, fmode); status = -ENOMEM; - opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, GFP_KERNEL); + if (dentry->d_inode) + claim = NFS4_OPEN_CLAIM_FH; + opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, + claim, GFP_KERNEL); if (opendata == NULL) goto err_put_state_owner; @@ -1937,7 +2026,8 @@ static int _nfs4_do_open(struct inode *dir, if (status != 0) goto err_opendata_put; - if (opendata->o_arg.open_flags & O_EXCL) { + if ((opendata->o_arg.open_flags & O_EXCL) && + (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { nfs4_exclusive_attrset(opendata, sattr); nfs_fattr_init(opendata->o_res.f_attr); @@ -1978,6 +2068,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct rpc_cred *cred, struct nfs4_threshold **ctx_th) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_state *res; int status; @@ -2021,7 +2112,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, exception.retry = 1; continue; } - res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), + if (nfs4_clear_cap_atomic_open_v1(server, status, &exception)) + continue; + res = ERR_PTR(nfs4_handle_exception(server, status, &exception)); } while (exception.retry); return res; @@ -2049,20 +2142,25 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, .rpc_cred = cred, }; unsigned long timestamp = jiffies; + fmode_t fmode; + bool truncate; int status; nfs_fattr_init(fattr); - if (state != NULL) { + /* Servers should only apply open mode checks for file size changes */ + truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; + fmode = truncate ? 
FMODE_WRITE : FMODE_READ; + + if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { + /* Use that stateid */ + } else if (truncate && state != NULL && nfs4_valid_open_stateid(state)) { struct nfs_lockowner lockowner = { .l_owner = current->files, .l_pid = current->tgid, }; nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, &lockowner); - } else if (nfs4_copy_delegation_stateid(&arg.stateid, inode, - FMODE_WRITE)) { - /* Use that stateid */ } else nfs4_stateid_copy(&arg.stateid, &zero_stateid); @@ -2086,6 +2184,13 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, err = _nfs4_do_setattr(inode, cred, fattr, sattr, state); switch (err) { case -NFS4ERR_OPENMODE: + if (!(sattr->ia_valid & ATTR_SIZE)) { + pr_warn_once("NFSv4: server %s is incorrectly " + "applying open mode checks to " + "a SETATTR that is not " + "changing file size.\n", + server->nfs_client->cl_hostname); + } if (state && !(state->state & FMODE_WRITE)) { err = -EBADF; if (sattr->ia_valid & ATTR_OPEN) @@ -2129,11 +2234,19 @@ static void nfs4_close_clear_stateid_flags(struct nfs4_state *state, fmode_t fmode) { spin_lock(&state->owner->so_lock); - if (!(fmode & FMODE_READ)) + clear_bit(NFS_O_RDWR_STATE, &state->flags); + switch (fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_WRITE: clear_bit(NFS_O_RDONLY_STATE, &state->flags); - if (!(fmode & FMODE_WRITE)) + break; + case FMODE_READ: clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_bit(NFS_O_RDWR_STATE, &state->flags); + break; + case 0: + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); + } spin_unlock(&state->owner->so_lock); } @@ -2201,6 +2314,8 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) calldata->arg.fmode &= ~FMODE_WRITE; } } + if (!nfs4_valid_open_stateid(state)) + call_close = 0; spin_unlock(&state->owner->so_lock); if (!call_close) { @@ -2211,8 +2326,10 @@ static void 
nfs4_close_prepare(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; if (calldata->roc && - pnfs_roc_drain(inode, &calldata->roc_barrier, task)) + pnfs_roc_drain(inode, &calldata->roc_barrier, task)) { + nfs_release_seqid(calldata->arg.seqid); goto out_wait; + } } nfs_fattr_init(calldata->res.fattr); @@ -2443,7 +2560,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl auth = rpcauth_create(flavor, server->client); if (IS_ERR(auth)) { - ret = -EIO; + ret = -EACCES; goto out; } ret = nfs4_lookup_root(server, fhandle, info); @@ -2451,27 +2568,36 @@ out: return ret; } +/* + * Retry pseudoroot lookup with various security flavors. We do this when: + * + * NFSv4.0: the PUTROOTFH operation returns NFS4ERR_WRONGSEC + * NFSv4.1: the server does not support the SECINFO_NO_NAME operation + * + * Returns zero on success, or a negative NFS4ERR value, or a + * negative errno value. + */ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { - int i, len, status = 0; - rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS]; - - len = rpcauth_list_flavors(flav_array, ARRAY_SIZE(flav_array)); - if (len < 0) - return len; - - for (i = 0; i < len; i++) { - /* AUTH_UNIX is the default flavor if none was specified, - * thus has already been tried. */ - if (flav_array[i] == RPC_AUTH_UNIX) - continue; + /* Per 3530bis 15.33.5 */ + static const rpc_authflavor_t flav_array[] = { + RPC_AUTH_GSS_KRB5P, + RPC_AUTH_GSS_KRB5I, + RPC_AUTH_GSS_KRB5, + RPC_AUTH_UNIX, /* courtesy */ + RPC_AUTH_NULL, + }; + int status = -EPERM; + size_t i; + for (i = 0; i < ARRAY_SIZE(flav_array); i++) { status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); if (status == -NFS4ERR_WRONGSEC || status == -EACCES) continue; break; } + /* * -EACCESS could mean that the user doesn't have correct permissions * to access the mount. 
It could also mean that we tried to mount @@ -2484,24 +2610,36 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -/* - * get the file handle for the "/" directory on the server +static int nfs4_do_find_root_sec(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs_fsinfo *info) +{ + int mv = server->nfs_client->cl_minorversion; + return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info); +} + +/** + * nfs4_proc_get_rootfh - get file handle for server's pseudoroot + * @server: initialized nfs_server handle + * @fhandle: we fill in the pseudo-fs root file handle + * @info: we fill in an FSINFO struct + * + * Returns zero on success, or a negative errno. */ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { - int minor_version = server->nfs_client->cl_minorversion; - int status = nfs4_lookup_root(server, fhandle, info); - if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR)) - /* - * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM - * by nfs4_map_errors() as this function exits. 
- */ - status = nfs_v4_minor_ops[minor_version]->find_root_sec(server, fhandle, info); + int status; + + status = nfs4_lookup_root(server, fhandle, info); + if ((status == -NFS4ERR_WRONGSEC) && + !(server->flags & NFS_MOUNT_SECFLAVOUR)) + status = nfs4_do_find_root_sec(server, fhandle, info); + if (status == 0) status = nfs4_server_capabilities(server, fhandle); if (status == 0) status = nfs4_do_fsinfo(server, fhandle, info); + return nfs4_map_errors(status); } @@ -2632,7 +2770,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, int status; if (pnfs_ld_layoutret_on_setattr(inode)) - pnfs_return_layout(inode); + pnfs_commit_and_return_layout(inode); nfs_fattr_init(fattr); @@ -3380,12 +3518,21 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) { struct nfs4_exception exception = { }; + unsigned long now = jiffies; int err; do { - err = nfs4_handle_exception(server, - _nfs4_do_fsinfo(server, fhandle, fsinfo), - &exception); + err = _nfs4_do_fsinfo(server, fhandle, fsinfo); + if (err == 0) { + struct nfs_client *clp = server->nfs_client; + + spin_lock(&clp->cl_lock); + clp->cl_lease_time = fsinfo->lease_time * HZ; + clp->cl_last_renewal = now; + spin_unlock(&clp->cl_lock); + break; + } + err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); return err; } @@ -3445,6 +3592,46 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return err; } +int nfs4_set_rw_stateid(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode) +{ + const struct nfs_lockowner *lockowner = NULL; + + if (l_ctx != NULL) + lockowner = &l_ctx->lockowner; + return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner); +} +EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); + +static bool nfs4_stateid_is_current(nfs4_stateid *stateid, + 
const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode) +{ + nfs4_stateid current_stateid; + + if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode)) + return false; + return nfs4_stateid_match(stateid, &current_stateid); +} + +static bool nfs4_error_stateid_expired(int err) +{ + switch (err) { + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_OPENMODE: + case -NFS4ERR_EXPIRED: + return true; + } + return false; +} + void __nfs4_read_done_cb(struct nfs_read_data *data) { nfs_invalidate_atime(data->header->inode); @@ -3465,6 +3652,20 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) return 0; } +static bool nfs4_read_stateid_changed(struct rpc_task *task, + struct nfs_readargs *args) +{ + + if (!nfs4_error_stateid_expired(task->tk_status) || + nfs4_stateid_is_current(&args->stateid, + args->context, + args->lock_context, + FMODE_READ)) + return false; + rpc_restart_call_prepare(task); + return true; +} + static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) { @@ -3472,7 +3673,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) if (!nfs4_sequence_done(task, &data->res.seq_res)) return -EAGAIN; - + if (nfs4_read_stateid_changed(task, &data->args)) + return -EAGAIN; return data->read_done_cb ? 
data->read_done_cb(task, data) : nfs4_read_done_cb(task, data); } @@ -3487,10 +3689,13 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) { - nfs4_setup_sequence(NFS_SERVER(data->header->inode), + if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, &data->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&data->args.stateid, data->args.context, + data->args.lock_context, FMODE_READ); } static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) @@ -3508,10 +3713,26 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data return 0; } +static bool nfs4_write_stateid_changed(struct rpc_task *task, + struct nfs_writeargs *args) +{ + + if (!nfs4_error_stateid_expired(task->tk_status) || + nfs4_stateid_is_current(&args->stateid, + args->context, + args->lock_context, + FMODE_WRITE)) + return false; + rpc_restart_call_prepare(task); + return true; +} + static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) { if (!nfs4_sequence_done(task, &data->res.seq_res)) return -EAGAIN; + if (nfs4_write_stateid_changed(task, &data->args)) + return -EAGAIN; return data->write_done_cb ? 
data->write_done_cb(task, data) : nfs4_write_done_cb(task, data); } @@ -3551,10 +3772,13 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) { - nfs4_setup_sequence(NFS_SERVER(data->header->inode), + if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, &data->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&data->args.stateid, data->args.context, + data->args.lock_context, FMODE_WRITE); } static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) @@ -3656,7 +3880,7 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, return -ENOMEM; data->client = clp; data->timestamp = jiffies; - return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, + return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT, &nfs4_renew_ops, data); } @@ -3670,7 +3894,7 @@ static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) unsigned long now = jiffies; int status; - status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (status < 0) return status; do_renew_lease(clp, now); @@ -3980,11 +4204,14 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_schedule_stateid_recovery(server, state); + if (nfs4_schedule_stateid_recovery(server, state) < 0) + goto stateid_invalid; goto wait_on_recovery; case -NFS4ERR_EXPIRED: - if (state != NULL) - nfs4_schedule_stateid_recovery(server, state); + if (state != NULL) { + if (nfs4_schedule_stateid_recovery(server, state) < 0) + goto stateid_invalid; + } case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); @@ -4016,6 +4243,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, } 
task->tk_status = nfs4_map_errors(task->tk_status); return 0; +stateid_invalid: + task->tk_status = -EIO; + return 0; wait_on_recovery: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) @@ -4143,27 +4373,17 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct nfs4_setclientid_res *arg, struct rpc_cred *cred) { - struct nfs_fsinfo fsinfo; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], .rpc_argp = arg, - .rpc_resp = &fsinfo, .rpc_cred = cred, }; - unsigned long now; int status; dprintk("NFS call setclientid_confirm auth=%s, (client ID %llx)\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_clientid); - now = jiffies; status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); - if (status == 0) { - spin_lock(&clp->cl_lock); - clp->cl_lease_time = fsinfo.lease_time * HZ; - clp->cl_last_renewal = now; - spin_unlock(&clp->cl_lock); - } dprintk("NFS reply setclientid_confirm: %d\n", status); return status; } @@ -4546,9 +4766,9 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * if (status != 0) goto out; /* Is this a delegated lock? 
*/ - if (test_bit(NFS_DELEGATED_STATE, &state->flags)) - goto out; lsp = request->fl_u.nfs4_fl.owner; + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) + goto out; seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); status = -ENOMEM; if (seqid == NULL) @@ -4627,17 +4847,23 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { goto out_release_lock_seqid; } - data->arg.open_stateid = &state->stateid; + data->arg.open_stateid = &state->open_stateid; data->arg.new_lock_owner = 1; data->res.open_seqid = data->arg.open_seqid; } else data->arg.new_lock_owner = 0; + if (!nfs4_valid_open_stateid(state)) { + data->rpc_status = -EBADF; + task->tk_action = NULL; + goto out_release_open_seqid; + } data->timestamp = jiffies; if (nfs4_setup_sequence(data->server, &data->arg.seq_args, &data->res.seq_res, task) == 0) return; +out_release_open_seqid: nfs_release_seqid(data->arg.open_seqid); out_release_lock_seqid: nfs_release_seqid(data->arg.lock_seqid); @@ -4983,58 +5209,16 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) return status; } -int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) +int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid) { struct nfs_server *server = NFS_SERVER(state->inode); - struct nfs4_exception exception = { }; int err; err = nfs4_set_lock_state(state, fl); if (err != 0) - goto out; - do { - err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - switch (err) { - default: - printk(KERN_ERR "NFS: %s: unhandled error " - "%d.\n", __func__, err); - case 0: - case -ESTALE: - goto out; - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_STALE_STATEID: - set_bit(NFS_DELEGATED_STATE, &state->flags); - case -NFS4ERR_EXPIRED: - nfs4_schedule_lease_recovery(server->nfs_client); - err = -EAGAIN; - goto out; - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case 
-NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - case -NFS4ERR_DEADSESSION: - set_bit(NFS_DELEGATED_STATE, &state->flags); - nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); - err = -EAGAIN; - goto out; - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OPENMODE: - nfs4_schedule_stateid_recovery(server, state); - err = 0; - goto out; - case -ENOMEM: - case -NFS4ERR_DENIED: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ - err = 0; - goto out; - } - set_bit(NFS_DELEGATED_STATE, &state->flags); - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); -out: - return err; + return err; + err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); + return nfs4_handle_delegation_recall_error(server, state, stateid, err); } struct nfs_release_lockowner_data { @@ -5054,9 +5238,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = { .rpc_release = nfs4_release_lockowner_release, }; -int nfs4_release_lockowner(struct nfs4_lock_state *lsp) +static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) { - struct nfs_server *server = lsp->ls_state->owner->so_server; struct nfs_release_lockowner_data *data; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], @@ -5848,7 +6031,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs41_sequence_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, }; if (!atomic_inc_not_zero(&clp->cl_count)) @@ -6416,22 +6599,8 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) static void nfs4_layoutcommit_release(void *calldata) { struct nfs4_layoutcommit_data *data = calldata; - struct pnfs_layout_segment *lseg, *tmp; - unsigned long *bitlock = &NFS_I(data->args.inode)->flags; pnfs_cleanup_layoutcommit(data); 
- /* Matched by references in pnfs_set_layoutcommit */ - list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { - list_del_init(&lseg->pls_lc_list); - if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, - &lseg->pls_flags)) - pnfs_put_lseg(lseg); - } - - clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); - put_rpccred(data->cred); kfree(data); } @@ -6613,26 +6782,76 @@ static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) return err; } -static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) -{ - struct nfs41_free_stateid_args args = { - .stateid = stateid, - }; +struct nfs_free_stateid_data { + struct nfs_server *server; + struct nfs41_free_stateid_args args; struct nfs41_free_stateid_res res; +}; + +static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_free_stateid_data *data = calldata; + nfs41_setup_sequence(nfs4_get_session(data->server), + &data->args.seq_args, + &data->res.seq_res, + task); +} + +static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata) +{ + struct nfs_free_stateid_data *data = calldata; + + nfs41_sequence_done(task, &data->res.seq_res); + + switch (task->tk_status) { + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } +} + +static void nfs41_free_stateid_release(void *calldata) +{ + kfree(calldata); +} + +const struct rpc_call_ops nfs41_free_stateid_ops = { + .rpc_call_prepare = nfs41_free_stateid_prepare, + .rpc_call_done = nfs41_free_stateid_done, + .rpc_release = nfs41_free_stateid_release, +}; + +static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + bool privileged) +{ struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], - .rpc_argp = &args, - .rpc_resp = &res, }; - int status; + struct 
rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs41_free_stateid_ops, + .flags = RPC_TASK_ASYNC, + }; + struct nfs_free_stateid_data *data; dprintk("NFS call free_stateid %p\n", stateid); - nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); - nfs4_set_sequence_privileged(&args.seq_args); - status = nfs4_call_sync_sequence(server->client, server, &msg, - &args.seq_args, &res.seq_res); - dprintk("NFS reply free_stateid: %d\n", status); - return status; + data = kmalloc(sizeof(*data), GFP_NOFS); + if (!data) + return ERR_PTR(-ENOMEM); + data->server = server; + nfs4_stateid_copy(&data->args.stateid, stateid); + + task_setup.callback_data = data; + + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + if (privileged) + nfs4_set_sequence_privileged(&data->args.seq_args); + + return rpc_run_task(&task_setup); } /** @@ -6646,15 +6865,29 @@ static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) */ static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) { - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_free_stateid(server, stateid); - if (err != -NFS4ERR_DELAY) - break; - nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; + struct rpc_task *task; + int ret; + + task = _nfs41_free_stateid(server, stateid, true); + if (IS_ERR(task)) + return PTR_ERR(task); + ret = rpc_wait_for_completion_task(task); + if (!ret) + ret = task->tk_status; + rpc_put_task(task); + return ret; +} + +static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) +{ + struct rpc_task *task; + + task = _nfs41_free_stateid(server, &lsp->ls_stateid, false); + nfs4_free_lock_state(server, lsp); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; } static bool nfs41_match_stateid(const nfs4_stateid *s1, @@ 
-6739,9 +6972,14 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK, .call_sync = _nfs4_call_sync, .match_stateid = nfs4_match_stateid, .find_root_sec = nfs4_find_root_sec, + .free_lock_state = nfs4_release_lockowner, .reboot_recovery_ops = &nfs40_reboot_recovery_ops, .nograce_recovery_ops = &nfs40_nograce_recovery_ops, .state_renewal_ops = &nfs40_state_renewal_ops, @@ -6750,9 +6988,16 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { #if defined(CONFIG_NFS_V4_1) static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK + | NFS_CAP_STATEID_NFSV41 + | NFS_CAP_ATOMIC_OPEN_V1, .call_sync = nfs4_call_sync_sequence, .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, + .free_lock_state = nfs41_free_lock_state, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 6ace365c6334..300d17d85c0e 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -154,18 +154,6 @@ struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) return cred; } -static void nfs4_clear_machine_cred(struct nfs_client *clp) -{ - struct rpc_cred *cred; - - spin_lock(&clp->cl_lock); - cred = clp->cl_machine_cred; - clp->cl_machine_cred = NULL; - spin_unlock(&clp->cl_lock); - if (cred != NULL) - put_rpccred(cred); -} - static struct rpc_cred * nfs4_get_renew_cred_server_locked(struct nfs_server *server) { @@ -699,6 +687,8 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) list_for_each_entry(state, 
&nfsi->open_states, inode_states) { if (state->owner != owner) continue; + if (!nfs4_valid_open_stateid(state)) + continue; if (atomic_inc_not_zero(&state->count)) return state; } @@ -931,6 +921,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ */ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) { + struct nfs_server *server; struct nfs4_state *state; if (lsp == NULL) @@ -942,11 +933,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); + server = state->owner->so_server; if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { - if (nfs4_release_lockowner(lsp) == 0) - return; - } - nfs4_free_lock_state(lsp->ls_state->owner->so_server, lsp); + struct nfs_client *clp = server->nfs_client; + + clp->cl_mvops->free_lock_state(server, lsp); + } else + nfs4_free_lock_state(server, lsp); } static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) @@ -987,13 +980,14 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) return 0; } -static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, +static int nfs4_copy_lock_stateid(nfs4_stateid *dst, + struct nfs4_state *state, const struct nfs_lockowner *lockowner) { struct nfs4_lock_state *lsp; fl_owner_t fl_owner; pid_t fl_pid; - bool ret = false; + int ret = -ENOENT; if (lockowner == NULL) @@ -1008,7 +1002,10 @@ static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { nfs4_stateid_copy(dst, &lsp->ls_stateid); - ret = true; + ret = 0; + smp_rmb(); + if (!list_empty(&lsp->ls_seqid.list)) + ret = -EWOULDBLOCK; } spin_unlock(&state->state_lock); nfs4_put_lock_state(lsp); @@ -1016,28 +1013,44 @@ out: return ret; } -static void 
nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { + const nfs4_stateid *src; + int ret; int seq; do { + src = &zero_stateid; seq = read_seqbegin(&state->seqlock); - nfs4_stateid_copy(dst, &state->stateid); + if (test_bit(NFS_OPEN_STATE, &state->flags)) + src = &state->open_stateid; + nfs4_stateid_copy(dst, src); + ret = 0; + smp_rmb(); + if (!list_empty(&state->owner->so_seqid.list)) + ret = -EWOULDBLOCK; } while (read_seqretry(&state->seqlock, seq)); + return ret; } /* * Byte-range lock aware utility to initialize the stateid of read/write * requests. */ -void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, +int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, fmode_t fmode, const struct nfs_lockowner *lockowner) { + int ret = 0; if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) - return; - if (nfs4_copy_lock_stateid(dst, state, lockowner)) - return; - nfs4_copy_open_stateid(dst, state); + goto out; + ret = nfs4_copy_lock_stateid(dst, state, lockowner); + if (ret != -ENOENT) + goto out; + ret = nfs4_copy_open_stateid(dst, state); +out: + if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41)) + dst->seqid = 0; + return ret; } struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) @@ -1286,14 +1299,17 @@ static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_s return 1; } -void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) +int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) { struct nfs_client *clp = server->nfs_client; + if (!nfs4_valid_open_stateid(state)) + return -EBADF; nfs4_state_mark_reclaim_nograce(clp, state); dprintk("%s: scheduling stateid recovery for server %s\n", __func__, clp->cl_hostname); nfs4_schedule_state_manager(clp); + return 0; } 
EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); @@ -1323,6 +1339,27 @@ void nfs_inode_find_state_and_recover(struct inode *inode, nfs4_schedule_state_manager(clp); } +static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) +{ + struct inode *inode = state->inode; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + if (ctx->state != state) + continue; + set_bit(NFS_CONTEXT_BAD, &ctx->flags); + } + spin_unlock(&inode->i_lock); +} + +static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error) +{ + set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags); + nfs4_state_mark_open_context_bad(state); +} + static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) { @@ -1398,6 +1435,8 @@ restart: list_for_each_entry(state, &sp->so_states, open_states) { if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) continue; + if (!nfs4_valid_open_stateid(state)) + continue; if (state->state == 0) continue; atomic_inc(&state->count); @@ -1430,11 +1469,10 @@ restart: * Open state on this file cannot be recovered * All we can do is revert to using the zero stateid. 
*/ - memset(&state->stateid, 0, - sizeof(state->stateid)); - /* Mark the file as being 'closed' */ - state->state = 0; + nfs4_state_mark_recovery_failed(state, status); break; + case -EAGAIN: + ssleep(1); case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: @@ -1696,6 +1734,10 @@ static int nfs4_check_lease(struct nfs_client *clp) } status = ops->renew_lease(clp, cred); put_rpccred(cred); + if (status == -ETIMEDOUT) { + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + return 0; + } out: return nfs4_recovery_handle_error(clp, status); } @@ -1725,10 +1767,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); return -EPERM; case -EACCES: - if (clp->cl_machine_cred == NULL) - return -EACCES; - /* Handle case where the user hasn't set up machine creds */ - nfs4_clear_machine_cred(clp); case -NFS4ERR_DELAY: case -ETIMEDOUT: case -EAGAIN: @@ -1823,31 +1861,18 @@ int nfs4_discover_server_trunking(struct nfs_client *clp, { const struct nfs4_state_recovery_ops *ops = clp->cl_mvops->reboot_recovery_ops; - rpc_authflavor_t *flavors, flav, save; struct rpc_clnt *clnt; struct rpc_cred *cred; - int i, len, status; + int i, status; dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname); - len = NFS_MAX_SECFLAVORS; - flavors = kcalloc(len, sizeof(*flavors), GFP_KERNEL); - if (flavors == NULL) { - status = -ENOMEM; - goto out; - } - len = rpcauth_list_flavors(flavors, len); - if (len < 0) { - status = len; - goto out_free; - } clnt = clp->cl_rpcclient; - save = clnt->cl_auth->au_flavor; i = 0; mutex_lock(&nfs_clid_init_mutex); - status = -ENOENT; again: + status = -ENOENT; cred = ops->get_clid_cred(clp); if (cred == NULL) goto out_unlock; @@ -1857,12 +1882,6 @@ again: switch (status) { case 0: break; - - case -EACCES: - if (clp->cl_machine_cred == NULL) - break; - /* Handle case where the user hasn't set up machine creds */ - nfs4_clear_machine_cred(clp); case 
-NFS4ERR_DELAY: case -ETIMEDOUT: case -EAGAIN: @@ -1871,22 +1890,23 @@ again: dprintk("NFS: %s after status %d, retrying\n", __func__, status); goto again; - + case -EACCES: + if (i++) + break; case -NFS4ERR_CLID_INUSE: case -NFS4ERR_WRONGSEC: - status = -EPERM; - if (i >= len) - break; - - flav = flavors[i++]; - if (flav == save) - flav = flavors[i++]; - clnt = rpc_clone_client_set_auth(clnt, flav); + clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX); if (IS_ERR(clnt)) { status = PTR_ERR(clnt); break; } - clp->cl_rpcclient = clnt; + /* Note: this is safe because we haven't yet marked the + * client as ready, so we are the only user of + * clp->cl_rpcclient + */ + clnt = xchg(&clp->cl_rpcclient, clnt); + rpc_shutdown_client(clnt); + clnt = clp->cl_rpcclient; goto again; case -NFS4ERR_MINOR_VERS_MISMATCH: @@ -1897,13 +1917,15 @@ again: case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ status = -EKEYEXPIRED; + break; + default: + pr_warn("NFS: %s unhandled error %d. 
Exiting with error EIO\n", + __func__, status); + status = -EIO; } out_unlock: mutex_unlock(&nfs_clid_init_mutex); -out_free: - kfree(flavors); -out: dprintk("NFS: %s: status = %d\n", __func__, status); return status; } diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 569b166cc050..a5e1a3026d48 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -252,6 +252,8 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name, dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + if (data->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) + data->auth_flavors[0] = RPC_AUTH_UNIX; export_path = data->nfs_server.export_path; data->nfs_server.export_path = "/"; root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e3edda554ac7..4be8d135ed61 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -530,14 +530,10 @@ static int nfs4_stat_to_errno(int); decode_setclientid_maxsz) #define NFS4_enc_setclientid_confirm_sz \ (compound_encode_hdr_maxsz + \ - encode_setclientid_confirm_maxsz + \ - encode_putrootfh_maxsz + \ - encode_fsinfo_maxsz) + encode_setclientid_confirm_maxsz) #define NFS4_dec_setclientid_confirm_sz \ (compound_decode_hdr_maxsz + \ - decode_setclientid_confirm_maxsz + \ - decode_putrootfh_maxsz + \ - decode_fsinfo_maxsz) + decode_setclientid_confirm_maxsz) #define NFS4_enc_lock_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -1058,8 +1054,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if (iap->ia_valid & ATTR_ATIME_SET) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(iap->ia_atime.tv_sec); + p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); } else if (iap->ia_valid & ATTR_ATIME) { @@ -1069,8 +1064,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if 
(iap->ia_valid & ATTR_MTIME_SET) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec); *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); } else if (iap->ia_valid & ATTR_MTIME) { @@ -1366,33 +1360,28 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) { + struct iattr dummy; __be32 *p; - struct nfs_client *clp; p = reserve_space(xdr, 4); - switch(arg->open_flags & O_EXCL) { - case 0: + switch(arg->createmode) { + case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); encode_attrs(xdr, arg->u.attrs, arg->server); break; - default: - clp = arg->server->nfs_client; - if (clp->cl_mvops->minor_version > 0) { - if (nfs4_has_persistent_session(clp)) { - *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->server); - } else { - struct iattr dummy; - - *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); - encode_nfs4_verifier(xdr, &arg->u.verifier); - dummy.ia_valid = 0; - encode_attrs(xdr, &dummy, arg->server); - } - } else { - *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); - encode_nfs4_verifier(xdr, &arg->u.verifier); - } + case NFS4_CREATE_GUARDED: + *p = cpu_to_be32(NFS4_CREATE_GUARDED); + encode_attrs(xdr, arg->u.attrs, arg->server); + break; + case NFS4_CREATE_EXCLUSIVE: + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); + encode_nfs4_verifier(xdr, &arg->u.verifier); + break; + case NFS4_CREATE_EXCLUSIVE4_1: + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); + encode_nfs4_verifier(xdr, &arg->u.verifier); + dummy.ia_valid = 0; + encode_attrs(xdr, &dummy, arg->server); } } @@ -1459,6 +1448,23 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc encode_string(xdr, name->len, name->name); } +static inline void encode_claim_fh(struct xdr_stream *xdr) +{ + __be32 
*p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_FH); +} + +static inline void encode_claim_delegate_cur_fh(struct xdr_stream *xdr, const nfs4_stateid *stateid) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEG_CUR_FH); + encode_nfs4_stateid(xdr, stateid); +} + static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr) { encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr); @@ -1474,6 +1480,12 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, case NFS4_OPEN_CLAIM_DELEGATE_CUR: encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); break; + case NFS4_OPEN_CLAIM_FH: + encode_claim_fh(xdr); + break; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + encode_claim_delegate_cur_fh(xdr, &arg->u.delegation); + break; default: BUG(); } @@ -1506,35 +1518,12 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr); } -static void encode_open_stateid(struct xdr_stream *xdr, - const struct nfs_open_context *ctx, - const struct nfs_lock_context *l_ctx, - fmode_t fmode, - int zero_seqid) -{ - nfs4_stateid stateid; - - if (ctx->state != NULL) { - const struct nfs_lockowner *lockowner = NULL; - - if (l_ctx != NULL) - lockowner = &l_ctx->lockowner; - nfs4_select_rw_stateid(&stateid, ctx->state, - fmode, lockowner); - if (zero_seqid) - stateid.seqid = 0; - encode_nfs4_stateid(xdr, &stateid); - } else - encode_nfs4_stateid(xdr, &zero_stateid); -} - static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) { __be32 *p; encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr); - encode_open_stateid(xdr, args->context, args->lock_context, - FMODE_READ, hdr->minorversion); + encode_nfs4_stateid(xdr, &args->stateid); p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); @@ -1670,8 +1659,7 @@ static void 
encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg __be32 *p; encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr); - encode_open_stateid(xdr, args->context, args->lock_context, - FMODE_WRITE, hdr->minorversion); + encode_nfs4_stateid(xdr, &args->stateid); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); @@ -2015,7 +2003,7 @@ static void encode_free_stateid(struct xdr_stream *xdr, struct compound_hdr *hdr) { encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr); - encode_nfs4_stateid(xdr, args->stateid); + encode_nfs4_stateid(xdr, &args->stateid); } #endif /* CONFIG_NFS_V4_1 */ @@ -2609,12 +2597,9 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, struct compound_hdr hdr = { .nops = 0, }; - const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; encode_compound_hdr(xdr, req, &hdr); encode_setclientid_confirm(xdr, arg, &hdr); - encode_putrootfh(xdr, &hdr); - encode_fsinfo(xdr, lease_bitmap, &hdr); encode_nops(&hdr); } @@ -3497,8 +3482,11 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) if (n == 0) goto root_path; dprintk("pathname4: "); - path->ncomponents = 0; - while (path->ncomponents < n) { + if (n > NFS4_PATHNAME_MAXCOMPONENTS) { + dprintk("cannot parse %d components in path\n", n); + goto out_eio; + } + for (path->ncomponents = 0; path->ncomponents < n; path->ncomponents++) { struct nfs4_string *component = &path->components[path->ncomponents]; status = decode_opaque_inline(xdr, &component->len, &component->data); if (unlikely(status != 0)) @@ -3507,12 +3495,6 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) pr_cont("%s%.*s ", (path->ncomponents != n ? 
"/ " : ""), component->len, component->data); - if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS) - path->ncomponents++; - else { - dprintk("cannot parse %d components in path\n", n); - goto out_eio; - } } out: return status; @@ -3557,27 +3539,23 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st n = be32_to_cpup(p); if (n <= 0) goto out_eio; - res->nlocations = 0; - while (res->nlocations < n) { + for (res->nlocations = 0; res->nlocations < n; res->nlocations++) { u32 m; - struct nfs4_fs_location *loc = &res->locations[res->nlocations]; + struct nfs4_fs_location *loc; + if (res->nlocations == NFS4_FS_LOCATIONS_MAXENTRIES) + break; + loc = &res->locations[res->nlocations]; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; m = be32_to_cpup(p); - loc->nservers = 0; dprintk("%s: servers:\n", __func__); - while (loc->nservers < m) { - struct nfs4_string *server = &loc->servers[loc->nservers]; - status = decode_opaque_inline(xdr, &server->len, &server->data); - if (unlikely(status != 0)) - goto out_eio; - dprintk("%s ", server->data); - if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS) - loc->nservers++; - else { + for (loc->nservers = 0; loc->nservers < m; loc->nservers++) { + struct nfs4_string *server; + + if (loc->nservers == NFS4_FS_LOCATION_MAXSERVERS) { unsigned int i; dprintk("%s: using first %u of %u servers " "returned for location %u\n", @@ -3591,13 +3569,17 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (unlikely(status != 0)) goto out_eio; } + break; } + server = &loc->servers[loc->nservers]; + status = decode_opaque_inline(xdr, &server->len, &server->data); + if (unlikely(status != 0)) + goto out_eio; + dprintk("%s ", server->data); } status = decode_pathname(xdr, &loc->rootpath); if (unlikely(status != 0)) goto out_eio; - if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) - res->nlocations++; } if (res->nlocations != 0) status = NFS_ATTR_FATTR_V4_LOCATIONS; @@ 
-5209,27 +5191,30 @@ static int decode_delegreturn(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_DELEGRETURN); } -static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor) +static int decode_secinfo_gss(struct xdr_stream *xdr, + struct nfs4_secinfo4 *flavor) { + u32 oid_len; __be32 *p; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - flavor->gss.sec_oid4.len = be32_to_cpup(p); - if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN) + oid_len = be32_to_cpup(p); + if (oid_len > GSS_OID_MAX_LEN) goto out_err; - p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len); + p = xdr_inline_decode(xdr, oid_len); if (unlikely(!p)) goto out_overflow; - memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len); + memcpy(flavor->flavor_info.oid.data, p, oid_len); + flavor->flavor_info.oid.len = oid_len; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - flavor->gss.qop4 = be32_to_cpup(p++); - flavor->gss.service = be32_to_cpup(p); + flavor->flavor_info.qop = be32_to_cpup(p++); + flavor->flavor_info.service = be32_to_cpup(p); return 0; @@ -5242,10 +5227,10 @@ out_err: static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) { - struct nfs4_secinfo_flavor *sec_flavor; + struct nfs4_secinfo4 *sec_flavor; + unsigned int i, num_flavors; int status; __be32 *p; - int i, num_flavors; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) @@ -6648,8 +6633,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, * Decode SETCLIENTID_CONFIRM response */ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, - struct xdr_stream *xdr, - struct nfs_fsinfo *fsinfo) + struct xdr_stream *xdr) { struct compound_hdr hdr; int status; @@ -6657,10 +6641,6 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, status = decode_compound_hdr(xdr, &hdr); if (!status) status = decode_setclientid_confirm(xdr); - if (!status) - status = decode_putrootfh(xdr); - if (!status) - 
status = decode_fsinfo(xdr, fsinfo); return status; } diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 88f9611a945c..5457745dd4f1 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -234,7 +234,7 @@ static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, lseg = kzalloc(lseg_size, gfp_flags); if (unlikely(!lseg)) { - dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__, + dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__, numdevs, lseg_size); return -ENOMEM; } diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 880ba086be94..87aa1dec6120 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -114,7 +114,7 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); -/* objio_free_result will free these @oir structs recieved from +/* objio_free_result will free these @oir structs received from * objlayout_{read,write}_done */ extern void objio_free_result(struct objlayout_io_res *oir); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e56e846e9d2d..29cfb7ade121 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -84,6 +84,55 @@ nfs_page_free(struct nfs_page *p) kmem_cache_free(nfs_page_cachep, p); } +static void +nfs_iocounter_inc(struct nfs_io_counter *c) +{ + atomic_inc(&c->io_count); +} + +static void +nfs_iocounter_dec(struct nfs_io_counter *c) +{ + if (atomic_dec_and_test(&c->io_count)) { + clear_bit(NFS_IO_INPROGRESS, &c->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&c->flags, NFS_IO_INPROGRESS); + } +} + +static int +__nfs_iocounter_wait(struct nfs_io_counter *c) +{ + wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS); + DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS); + int ret = 0; + + do { + prepare_to_wait(wq, &q.wait, TASK_KILLABLE); + set_bit(NFS_IO_INPROGRESS, &c->flags); + if 
(atomic_read(&c->io_count) == 0) + break; + ret = nfs_wait_bit_killable(&c->flags); + } while (atomic_read(&c->io_count) != 0); + finish_wait(wq, &q.wait); + return ret; +} + +/** + * nfs_iocounter_wait - wait for i/o to complete + * @c: nfs_io_counter to use + * + * returns -ERESTARTSYS if interrupted by a fatal signal. + * Otherwise returns 0 once the io_count hits 0. + */ +int +nfs_iocounter_wait(struct nfs_io_counter *c) +{ + if (atomic_read(&c->io_count) == 0) + return 0; + return __nfs_iocounter_wait(c); +} + /** * nfs_create_request - Create an NFS read/write request. * @ctx: open context to use @@ -104,6 +153,8 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, struct nfs_page *req; struct nfs_lock_context *l_ctx; + if (test_bit(NFS_CONTEXT_BAD, &ctx->flags)) + return ERR_PTR(-EBADF); /* try to allocate the request struct */ req = nfs_page_alloc(); if (req == NULL) @@ -116,6 +167,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, return ERR_CAST(l_ctx); } req->wb_lock_context = l_ctx; + nfs_iocounter_inc(&l_ctx->io_count); /* Initialize the request struct. Initially, we assume a * long write-back delay. 
This will be adjusted in @@ -175,6 +227,7 @@ static void nfs_clear_request(struct nfs_page *req) req->wb_page = NULL; } if (l_ctx != NULL) { + nfs_iocounter_dec(&l_ctx->io_count); nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 48ac5aad6258..c5bd758e5637 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -417,6 +417,16 @@ should_free_lseg(struct pnfs_layout_range *lseg_range, lo_seg_intersecting(lseg_range, recall_range); } +static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + if (!atomic_dec_and_test(&lseg->pls_refcount)) + return false; + pnfs_layout_remove_lseg(lseg->pls_layout, lseg); + list_add(&lseg->pls_list, tmp_list); + return true; +} + /* Returns 1 if lseg is removed from list, 0 otherwise */ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) @@ -430,11 +440,8 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, */ dprintk("%s: lseg %p ref %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount)); - if (atomic_dec_and_test(&lseg->pls_refcount)) { - pnfs_layout_remove_lseg(lseg->pls_layout, lseg); - list_add(&lseg->pls_list, tmp_list); + if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) rv = 1; - } } return rv; } @@ -711,6 +718,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, spin_lock(&lo->plh_inode->i_lock); if (pnfs_layoutgets_blocked(lo, 1)) { status = -EAGAIN; + } else if (!nfs4_valid_open_stateid(open_state)) { + status = -EBADF; } else if (list_empty(&lo->plh_segs)) { int seq; @@ -777,6 +786,21 @@ send_layoutget(struct pnfs_layout_hdr *lo, return lseg; } +static void pnfs_clear_layoutcommit(struct inode *inode, + struct list_head *head) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct pnfs_layout_segment *lseg, *tmp; + + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + return; + list_for_each_entry_safe(lseg, tmp, 
&nfsi->layout->plh_segs, pls_list) { + if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + continue; + pnfs_lseg_dec_and_remove_zero(lseg, head); + } +} + /* * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr * when the layout segment list is empty. @@ -808,6 +832,7 @@ _pnfs_return_layout(struct inode *ino) /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); + pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { @@ -820,8 +845,6 @@ _pnfs_return_layout(struct inode *ino) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); - WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)); - lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); if (unlikely(lrp == NULL)) { status = -ENOMEM; @@ -845,6 +868,33 @@ out: } EXPORT_SYMBOL_GPL(_pnfs_return_layout); +int +pnfs_commit_and_return_layout(struct inode *inode) +{ + struct pnfs_layout_hdr *lo; + int ret; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo == NULL) { + spin_unlock(&inode->i_lock); + return 0; + } + pnfs_get_layout_hdr(lo); + /* Block new layoutgets and read/write to ds */ + lo->plh_block_lgets++; + spin_unlock(&inode->i_lock); + filemap_fdatawait(inode->i_mapping); + ret = pnfs_layoutcommit_inode(inode, true); + if (ret == 0) + ret = _pnfs_return_layout(inode); + spin_lock(&inode->i_lock); + lo->plh_block_lgets--; + spin_unlock(&inode->i_lock); + pnfs_put_layout_hdr(lo); + return ret; +} + bool pnfs_roc(struct inode *ino) { struct pnfs_layout_hdr *lo; @@ -1458,7 +1508,6 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data) dprintk("pnfs write error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); pnfs_return_layout(hdr->inode); } if 
(!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) @@ -1613,7 +1662,6 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) dprintk("pnfs read error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) @@ -1746,11 +1794,27 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { if (lseg->pls_range.iomode == IOMODE_RW && - test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) list_add(&lseg->pls_lc_list, listp); } } +static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) +{ + struct pnfs_layout_segment *lseg, *tmp; + unsigned long *bitlock = &NFS_I(inode)->flags; + + /* Matched by references in pnfs_set_layoutcommit */ + list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { + list_del_init(&lseg->pls_lc_list); + pnfs_put_lseg(lseg); + } + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} + void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) { pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); @@ -1795,6 +1859,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) if (nfss->pnfs_curr_ld->cleanup_layoutcommit) nfss->pnfs_curr_ld->cleanup_layoutcommit(data); + pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list); } /* diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 94ba80417748..f5f8a470a647 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -219,6 +219,7 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool 
sync); int _pnfs_return_layout(struct inode *); +int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_write_data *); void pnfs_ld_read_done(struct nfs_read_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, @@ -407,6 +408,11 @@ static inline int pnfs_return_layout(struct inode *ino) return 0; } +static inline int pnfs_commit_and_return_layout(struct inode *inode) +{ + return 0; +} + static inline bool pnfs_ld_layoutret_on_setattr(struct inode *inode) { diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a5e5d9899d56..70a26c651f09 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -514,6 +514,8 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) + rpc_exit(task, -EIO); } static const struct rpc_call_ops nfs_read_common_ops = { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 17b32b722457..a366107a7331 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -294,6 +294,7 @@ struct file_system_type nfs_fs_type = { .kill_sb = nfs_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("nfs"); EXPORT_SYMBOL_GPL(nfs_fs_type); struct file_system_type nfs_xdev_fs_type = { @@ -333,6 +334,8 @@ struct file_system_type nfs4_fs_type = { .kill_sb = nfs_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; +MODULE_ALIAS_FS("nfs4"); +MODULE_ALIAS("nfs4"); EXPORT_SYMBOL_GPL(nfs4_fs_type); static int __init register_nfs4_fs(void) @@ -917,7 +920,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) data->mount_server.port = NFS_UNSPEC_PORT; data->nfs_server.port = NFS_UNSPEC_PORT; data->nfs_server.protocol = XPRT_TRANSPORT_TCP; - data->auth_flavors[0] = RPC_AUTH_UNIX; + data->auth_flavors[0] = RPC_AUTH_MAXFLAVOR; data->auth_flavor_len = 1; data->minorversion = 0; data->need_mount = 
true; @@ -1605,48 +1608,91 @@ out_security_failure: } /* - * Match the requested auth flavors with the list returned by - * the server. Returns zero and sets the mount's authentication - * flavor on success; returns -EACCES if server does not support - * the requested flavor. + * Select a security flavor for this mount. The selected flavor + * is planted in args->auth_flavors[0]. + * + * Returns 0 on success, -EACCES on failure. */ -static int nfs_walk_authlist(struct nfs_parsed_mount_data *args, - struct nfs_mount_request *request) +static int nfs_select_flavor(struct nfs_parsed_mount_data *args, + struct nfs_mount_request *request) { - unsigned int i, j, server_authlist_len = *(request->auth_flav_len); + unsigned int i, count = *(request->auth_flav_len); + rpc_authflavor_t flavor; + + /* + * The NFSv2 MNT operation does not return a flavor list. + */ + if (args->mount_server.version != NFS_MNT3_VERSION) + goto out_default; /* * Certain releases of Linux's mountd return an empty - * flavor list. To prevent behavioral regression with - * these servers (ie. rejecting mounts that used to - * succeed), revert to pre-2.6.32 behavior (no checking) - * if the returned flavor list is empty. + * flavor list in some cases. */ - if (server_authlist_len == 0) - return 0; + if (count == 0) + goto out_default; /* - * We avoid sophisticated negotiating here, as there are - * plenty of cases where we can get it wrong, providing - * either too little or too much security. + * If the sec= mount option is used, the specified flavor or AUTH_NULL + * must be in the list returned by the server. * + * AUTH_NULL has a special meaning when it's in the server list - it + * means that the server will ignore the rpc creds, so any flavor + * can be used. 
+ */ + if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) { + for (i = 0; i < count; i++) { + if (args->auth_flavors[0] == request->auth_flavs[i] || + request->auth_flavs[i] == RPC_AUTH_NULL) + goto out; + } + dfprintk(MOUNT, "NFS: auth flavor %d not supported by server\n", + args->auth_flavors[0]); + goto out_err; + } + + /* * RFC 2623, section 2.7 suggests we SHOULD prefer the * flavor listed first. However, some servers list - * AUTH_NULL first. Our caller plants AUTH_SYS, the - * preferred default, in args->auth_flavors[0] if user - * didn't specify sec= mount option. + * AUTH_NULL first. Avoid ever choosing AUTH_NULL. */ - for (i = 0; i < args->auth_flavor_len; i++) - for (j = 0; j < server_authlist_len; j++) - if (args->auth_flavors[i] == request->auth_flavs[j]) { - dfprintk(MOUNT, "NFS: using auth flavor %d\n", - request->auth_flavs[j]); - args->auth_flavors[0] = request->auth_flavs[j]; - return 0; - } + for (i = 0; i < count; i++) { + struct rpcsec_gss_info info; + + flavor = request->auth_flavs[i]; + switch (flavor) { + case RPC_AUTH_UNIX: + goto out_set; + case RPC_AUTH_NULL: + continue; + default: + if (rpcauth_get_gssinfo(flavor, &info) == 0) + goto out_set; + } + } + + /* + * As a last chance, see if the server list contains AUTH_NULL - + * if it does, use the default flavor. + */ + for (i = 0; i < count; i++) { + if (request->auth_flavs[i] == RPC_AUTH_NULL) + goto out_default; + } + + dfprintk(MOUNT, "NFS: no auth flavors in common with server\n"); + goto out_err; - dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n"); - nfs_umount(request); +out_default: + /* use default if flavor not already set */ + flavor = (args->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) ? 
+ RPC_AUTH_UNIX : args->auth_flavors[0]; +out_set: + args->auth_flavors[0] = flavor; +out: + dfprintk(MOUNT, "NFS: using auth flavor %d\n", args->auth_flavors[0]); + return 0; +out_err: return -EACCES; } @@ -1710,12 +1756,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args, return status; } - /* - * MNTv1 (NFSv2) does not support auth flavor negotiation. - */ - if (args->mount_server.version != NFS_MNT3_VERSION) - return 0; - return nfs_walk_authlist(args, &request); + return nfs_select_flavor(args, &request); } struct dentry *nfs_try_mount(int flags, const char *dev_name, @@ -2378,10 +2419,9 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { /* clone any lsm security options from the parent to the new sb */ - security_sb_clone_mnt_opts(mount_info->cloned->sb, s); if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) return -ESTALE; - return 0; + return security_sb_clone_mnt_opts(mount_info->cloned->sb, s); } EXPORT_SYMBOL_GPL(nfs_clone_sb_security); @@ -2717,6 +2757,5 @@ module_param(send_implementation_id, ushort, 0644); MODULE_PARM_DESC(send_implementation_id, "Send implementation ID with NFSv4.1 exchange_id"); MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string"); -MODULE_ALIAS("nfs4"); #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c483cc50b82e..a2c7c28049d5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1251,6 +1251,8 @@ void nfs_write_prepare(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) + rpc_exit(task, -EIO); } void nfs_commit_prepare(struct rpc_task *task, void *calldata) diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 87fd1410b737..d5c5b3e00266 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -82,6 +82,7 @@ int 
nfsd_reply_cache_init(void); void nfsd_reply_cache_shutdown(void); int nfsd_cache_lookup(struct svc_rqst *); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); +int nfsd_reply_cache_stats_open(struct inode *, struct file *); #ifdef CONFIG_NFSD_V4 void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp); diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 1051bebff1b0..849a7c3ced22 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -80,6 +80,7 @@ struct nfsd_net { */ struct list_head client_lru; struct list_head close_lru; + struct list_head del_recall_lru; struct delayed_work laundromat_work; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 99bc85ff0217..7f05cd140de3 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -37,6 +37,7 @@ #include "nfsd.h" #include "state.h" #include "netns.h" +#include "xdr4cb.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -53,30 +54,6 @@ enum { NFSPROC4_CLNT_CB_SEQUENCE, }; -#define NFS4_MAXTAGLEN 20 - -#define NFS4_enc_cb_null_sz 0 -#define NFS4_dec_cb_null_sz 0 -#define cb_compound_enc_hdr_sz 4 -#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) -#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) -#define cb_sequence_enc_sz (sessionid_sz + 4 + \ - 1 /* no referring calls list yet */) -#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) - -#define op_enc_sz 1 -#define op_dec_sz 2 -#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) -#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) -#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ - cb_sequence_enc_sz + \ - 1 + enc_stateid_sz + \ - enc_nfs4_fh_sz) - -#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ - cb_sequence_dec_sz + \ - op_dec_sz) - struct nfs4_cb_compound_hdr { /* args */ u32 ident; /* minorversion 0 only */ @@ -817,8 +794,7 @@ static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = 
calldata; - struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfs4_client *clp = cb->cb_clp; u32 minorversion = clp->cl_minorversion; cb->cb_minorversion = minorversion; @@ -839,8 +815,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) static void nfsd4_cb_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; - struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfs4_client *clp = cb->cb_clp; dprintk("%s: minorversion=%d\n", __func__, clp->cl_minorversion); @@ -863,7 +838,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *current_rpc_client = clp->cl_cb_client; nfsd4_cb_done(task, calldata); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ae73175e6e68..8ae5abfe6ba2 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -191,9 +191,18 @@ static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) return nfserr_symlink; } +static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh *resfh) +{ + if (nfsd4_has_session(cstate)) + return; + fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, + &resfh->fh_handle); +} + static __be32 -do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) +do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { + struct svc_fh *current_fh = &cstate->current_fh; struct svc_fh *resfh; int accmode; __be32 status; @@ -252,9 +261,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh 
*current_fh, struct nfsd4_o if (is_create_with_attrs(open) && open->op_acl != NULL) do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval); - /* set reply cache */ - fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, - &resfh->fh_handle); + nfsd4_set_open_owner_reply_cache(cstate, open, resfh); accmode = NFSD_MAY_NOP; if (open->op_created) accmode |= NFSD_MAY_OWNER_OVERRIDE; @@ -268,8 +275,9 @@ out: } static __be32 -do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) +do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { + struct svc_fh *current_fh = &cstate->current_fh; __be32 status; /* We don't know the target directory, and therefore can not @@ -278,9 +286,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); - /* set replay cache */ - fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, - &current_fh->fh_handle); + nfsd4_set_open_owner_reply_cache(cstate, open, current_fh); open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && (open->op_iattr.ia_size == 0); @@ -351,6 +357,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } if (status) goto out; + if (open->op_xdr_error) { + status = open->op_xdr_error; + goto out; + } status = nfsd4_check_open_attributes(rqstp, cstate, open); if (status) @@ -368,8 +378,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_NULL: - status = do_open_lookup(rqstp, &cstate->current_fh, - open); + status = do_open_lookup(rqstp, cstate, open); if (status) goto out; break; @@ -382,8 +391,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: - status = do_open_fhandle(rqstp,
&cstate->current_fh, - open); + status = do_open_fhandle(rqstp, cstate, open); if (status) goto out; break; @@ -409,14 +417,33 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, WARN_ON(status && open->op_created); out: nfsd4_cleanup_open_state(open, status); - if (open->op_openowner) + if (open->op_openowner && !nfsd4_has_session(cstate)) cstate->replay_owner = &open->op_openowner->oo_owner; - else + nfsd4_bump_seqid(cstate, status); + if (!cstate->replay_owner) nfs4_unlock_state(); return status; } /* + * OPEN is the only seqid-mutating operation whose decoding can fail + * with a seqid-mutating error (specifically, decoding of user names in + * the attributes). Therefore we have to do some processing to look up + * the stateowner so that we can bump the seqid. + */ +static __be32 nfsd4_open_omfg(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_op *op) +{ + struct nfsd4_open *open = (struct nfsd4_open *)&op->u; + + if (!seqid_mutating_err(ntohl(op->status))) + return op->status; + if (nfsd4_has_session(cstate)) + return op->status; + open->op_xdr_error = op->status; + return nfsd4_open(rqstp, cstate, open); +} + +/* * filehandle-manipulating ops. */ static __be32 @@ -786,21 +813,11 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, rename->rn_snamelen, &cstate->current_fh, rename->rn_tname, rename->rn_tnamelen); - - /* the underlying filesystem returns different error's than required - * by NFSv4. both save_fh and current_fh have been verified.. 
*/ - if (status == nfserr_isdir) - status = nfserr_exist; - else if ((status == nfserr_notdir) && - (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) && - S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode))) - status = nfserr_exist; - - if (!status) { - set_change_info(&rename->rn_sinfo, &cstate->current_fh); - set_change_info(&rename->rn_tinfo, &cstate->save_fh); - } - return status; + if (status) + return status; + set_change_info(&rename->rn_sinfo, &cstate->current_fh); + set_change_info(&rename->rn_tinfo, &cstate->save_fh); + return nfs_ok; } static __be32 @@ -931,14 +948,14 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, stateid, WR_STATE, &filp); - if (filp) - get_file(filp); - nfs4_unlock_state(); - if (status) { + nfs4_unlock_state(); dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); return status; } + if (filp) + get_file(filp); + nfs4_unlock_state(); cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; @@ -1244,8 +1261,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, * for example, if there is a miscellaneous XDR error * it will be set to nfserr_bad_xdr. 
*/ - if (op->status) + if (op->status) { + if (op->opnum == OP_OPEN) + op->status = nfsd4_open_omfg(rqstp, cstate, op); goto encode_op; + } /* We must be able to encode a successful response to * this operation, with enough room left over to encode a @@ -1282,12 +1302,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (op->status) goto encode_op; - if (opdesc->op_func) { - if (opdesc->op_get_currentstateid) - opdesc->op_get_currentstateid(cstate, &op->u); - op->status = opdesc->op_func(rqstp, cstate, &op->u); - } else - BUG_ON(op->status == nfs_ok); + if (opdesc->op_get_currentstateid) + opdesc->op_get_currentstateid(cstate, &op->u); + op->status = opdesc->op_func(rqstp, cstate, &op->u); if (!op->status) { if (opdesc->op_set_currentstateid) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 16d39c6c4fbb..316ec843dec2 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -42,6 +42,7 @@ #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/addr.h> #include "xdr4.h" +#include "xdr4cb.h" #include "vfs.h" #include "current_stateid.h" @@ -94,17 +95,32 @@ nfs4_lock_state(void) mutex_lock(&client_mutex); } -static void free_session(struct kref *); +static void free_session(struct nfsd4_session *); -/* Must be called under the client_lock */ -static void nfsd4_put_session_locked(struct nfsd4_session *ses) +void nfsd4_put_session(struct nfsd4_session *ses) +{ + atomic_dec(&ses->se_ref); +} + +static bool is_session_dead(struct nfsd4_session *ses) +{ + return ses->se_flags & NFS4_SESSION_DEAD; +} + +static __be32 mark_session_dead_locked(struct nfsd4_session *ses) { - kref_put(&ses->se_ref, free_session); + if (atomic_read(&ses->se_ref)) + return nfserr_jukebox; + ses->se_flags |= NFS4_SESSION_DEAD; + return nfs_ok; } -static void nfsd4_get_session(struct nfsd4_session *ses) +static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses) { - kref_get(&ses->se_ref); + if (is_session_dead(ses)) + return nfserr_badsession; + atomic_inc(&ses->se_ref); 
+ return nfs_ok; } void @@ -113,6 +129,90 @@ nfs4_unlock_state(void) mutex_unlock(&client_mutex); } +static bool is_client_expired(struct nfs4_client *clp) +{ + return clp->cl_time == 0; +} + +static __be32 mark_client_expired_locked(struct nfs4_client *clp) +{ + if (atomic_read(&clp->cl_refcount)) + return nfserr_jukebox; + clp->cl_time = 0; + return nfs_ok; +} + +static __be32 mark_client_expired(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + __be32 ret; + + spin_lock(&nn->client_lock); + ret = mark_client_expired_locked(clp); + spin_unlock(&nn->client_lock); + return ret; +} + +static __be32 get_client_locked(struct nfs4_client *clp) +{ + if (is_client_expired(clp)) + return nfserr_expired; + atomic_inc(&clp->cl_refcount); + return nfs_ok; +} + +/* must be called under the client_lock */ +static inline void +renew_client_locked(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (is_client_expired(clp)) { + WARN_ON(1); + printk("%s: client (clientid %08x/%08x) already expired\n", + __func__, + clp->cl_clientid.cl_boot, + clp->cl_clientid.cl_id); + return; + } + + dprintk("renewing client (clientid %08x/%08x)\n", + clp->cl_clientid.cl_boot, + clp->cl_clientid.cl_id); + list_move_tail(&clp->cl_lru, &nn->client_lru); + clp->cl_time = get_seconds(); +} + +static inline void +renew_client(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + spin_lock(&nn->client_lock); + renew_client_locked(clp); + spin_unlock(&nn->client_lock); +} + +static void put_client_renew_locked(struct nfs4_client *clp) +{ + if (!atomic_dec_and_test(&clp->cl_refcount)) + return; + if (!is_client_expired(clp)) + renew_client_locked(clp); +} + +void put_client_renew(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock)) + return; + if (!is_client_expired(clp)) + 
renew_client_locked(clp); + spin_unlock(&nn->client_lock); +} + + static inline u32 opaque_hashval(const void *ptr, int nbytes) { @@ -126,8 +226,6 @@ opaque_hashval(const void *ptr, int nbytes) return x; } -static struct list_head del_recall_lru; - static void nfsd4_free_file(struct nfs4_file *f) { kmem_cache_free(file_slab, f); @@ -137,7 +235,7 @@ static inline void put_nfs4_file(struct nfs4_file *fi) { if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { - list_del(&fi->fi_hash); + hlist_del(&fi->fi_hash); spin_unlock(&recall_lock); iput(fi->fi_inode); nfsd4_free_file(fi); @@ -181,7 +279,7 @@ static unsigned int file_hashval(struct inode *ino) return hash_ptr(ino, FILE_HASH_BITS); } -static struct list_head file_hashtbl[FILE_HASH_SIZE]; +static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) { @@ -210,13 +308,7 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) { if (atomic_dec_and_test(&fp->fi_access[oflag])) { nfs4_file_put_fd(fp, oflag); - /* - * It's also safe to get rid of the RDWR open *if* - * we no longer have need of the other kind of access - * or if we already have the other kind of open: - */ - if (fp->fi_fds[1-oflag] - || atomic_read(&fp->fi_access[1 - oflag]) == 0) + if (atomic_read(&fp->fi_access[1 - oflag]) == 0) nfs4_file_put_fd(fp, O_RDWR); } } @@ -230,42 +322,10 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) __nfs4_file_put_access(fp, oflag); } -static inline int get_new_stid(struct nfs4_stid *stid) -{ - static int min_stateid = 0; - struct idr *stateids = &stid->sc_client->cl_stateids; - int new_stid; - int error; - - error = idr_get_new_above(stateids, stid, min_stateid, &new_stid); - /* - * Note: the necessary preallocation was done in - * nfs4_alloc_stateid(). 
The idr code caps the number of - * preallocations that can exist at a time, but the state lock - * prevents anyone from using ours before we get here: - */ - WARN_ON_ONCE(error); - /* - * It shouldn't be a problem to reuse an opaque stateid value. - * I don't think it is for 4.1. But with 4.0 I worry that, for - * example, a stray write retransmission could be accepted by - * the server when it should have been rejected. Therefore, - * adopt a trick from the sctp code to attempt to maximize the - * amount of time until an id is reused, by ensuring they always - * "increase" (mod INT_MAX): - */ - - min_stateid = new_stid+1; - if (min_stateid == INT_MAX) - min_stateid = 0; - return new_stid; -} - static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) { struct idr *stateids = &cl->cl_stateids; - static int min_stateid = 0; struct nfs4_stid *stid; int new_id; @@ -273,9 +333,8 @@ kmem_cache *slab) if (!stid) return NULL; - if (!idr_pre_get(stateids, GFP_KERNEL)) - goto out_free; - if (idr_get_new_above(stateids, stid, min_stateid, &new_id)) + new_id = idr_alloc_cyclic(stateids, stid, 0, 0, GFP_KERNEL); + if (new_id < 0) goto out_free; stid->sc_client = cl; stid->sc_type = 0; @@ -293,13 +352,9 @@ kmem_cache *slab) * amount of time until an id is reused, by ensuring they always * "increase" (mod INT_MAX): */ - - min_stateid = new_id+1; - if (min_stateid == INT_MAX) - min_stateid = 0; return stid; out_free: - kfree(stid); + kmem_cache_free(slab, stid); return NULL; } @@ -350,21 +405,18 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv return dp; } -static void free_stid(struct nfs4_stid *s, struct kmem_cache *slab) +static void remove_stid(struct nfs4_stid *s) { struct idr *stateids = &s->sc_client->cl_stateids; idr_remove(stateids, s->sc_stateid.si_opaque.so_id); - kmem_cache_free(slab, s); } void nfs4_put_delegation(struct nfs4_delegation *dp) { if (atomic_dec_and_test(&dp->dl_count)) { - 
dprintk("NFSD: freeing dp %p\n",dp); - put_nfs4_file(dp->dl_file); - free_stid(&dp->dl_stid, deleg_slab); + kmem_cache_free(deleg_slab, dp); num_delegations--; } } @@ -388,16 +440,45 @@ static void unhash_stid(struct nfs4_stid *s) static void unhash_delegation(struct nfs4_delegation *dp) { - unhash_stid(&dp->dl_stid); list_del_init(&dp->dl_perclnt); spin_lock(&recall_lock); list_del_init(&dp->dl_perfile); list_del_init(&dp->dl_recall_lru); spin_unlock(&recall_lock); nfs4_put_deleg_lease(dp->dl_file); + put_nfs4_file(dp->dl_file); + dp->dl_file = NULL; +} + + + +static void destroy_revoked_delegation(struct nfs4_delegation *dp) +{ + list_del_init(&dp->dl_recall_lru); + remove_stid(&dp->dl_stid); nfs4_put_delegation(dp); } +static void destroy_delegation(struct nfs4_delegation *dp) +{ + unhash_delegation(dp); + remove_stid(&dp->dl_stid); + nfs4_put_delegation(dp); +} + +static void revoke_delegation(struct nfs4_delegation *dp) +{ + struct nfs4_client *clp = dp->dl_stid.sc_client; + + if (clp->cl_minorversion == 0) + destroy_delegation(dp); + else { + unhash_delegation(dp); + dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; + list_add(&dp->dl_recall_lru, &clp->cl_revoked); + } +} + /* * SETCLIENTID state */ @@ -538,7 +619,8 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp) static void free_generic_stateid(struct nfs4_ol_stateid *stp) { - free_stid(&stp->st_stid, stateid_slab); + remove_stid(&stp->st_stid); + kmem_cache_free(stateid_slab, stp); } static void release_lock_stateid(struct nfs4_ol_stateid *stp) @@ -654,6 +736,28 @@ dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) } #endif +/* + * Bump the seqid on cstate->replay_owner, and clear replay_owner if it + * won't be used for replay. 
+ */ +void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr) +{ + struct nfs4_stateowner *so = cstate->replay_owner; + + if (nfserr == nfserr_replay_me) + return; + + if (!seqid_mutating_err(ntohl(nfserr))) { + cstate->replay_owner = NULL; + return; + } + if (!so) + return; + if (so->so_is_open_owner) + release_last_closed_stateid(openowner(so)); + so->so_seqid++; + return; +} static void gen_sessionid(struct nfsd4_session *ses) @@ -694,17 +798,15 @@ free_session_slots(struct nfsd4_session *ses) * We don't actually need to cache the rpc and session headers, so we * can allocate a little less for each slot: */ -static inline int slot_bytes(struct nfsd4_channel_attrs *ca) -{ - return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; -} - -static int nfsd4_sanitize_slot_size(u32 size) +static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) { - size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */ - size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE); + u32 size; - return size; + if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ) + size = 0; + else + size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; + return size + sizeof(struct nfsd4_slot); } /* @@ -712,12 +814,12 @@ static int nfsd4_sanitize_slot_size(u32 size) * re-negotiate active sessions and reduce their slot usage to make * room for new connections. For now we just fail the create session. 
*/ -static int nfsd4_get_drc_mem(int slotsize, u32 num) +static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) { + u32 slotsize = slot_bytes(ca); + u32 num = ca->maxreqs; int avail; - num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION); - spin_lock(&nfsd_drc_lock); avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, nfsd_drc_max_mem - nfsd_drc_mem_used); @@ -728,15 +830,19 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num) return num; } -static void nfsd4_put_drc_mem(int slotsize, int num) +static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca) { + int slotsize = slot_bytes(ca); + spin_lock(&nfsd_drc_lock); - nfsd_drc_mem_used -= slotsize * num; + nfsd_drc_mem_used -= slotsize * ca->maxreqs; spin_unlock(&nfsd_drc_lock); } -static struct nfsd4_session *__alloc_session(int slotsize, int numslots) +static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs) { + int numslots = attrs->maxreqs; + int slotsize = slot_bytes(attrs); struct nfsd4_session *new; int mem, i; @@ -749,8 +855,7 @@ static struct nfsd4_session *__alloc_session(int slotsize, int numslots) return NULL; /* allocate each struct nfsd4_slot and data cache in one piece */ for (i = 0; i < numslots; i++) { - mem = sizeof(struct nfsd4_slot) + slotsize; - new->se_slots[i] = kzalloc(mem, GFP_KERNEL); + new->se_slots[i] = kzalloc(slotsize, GFP_KERNEL); if (!new->se_slots[i]) goto out_free; } @@ -762,21 +867,6 @@ out_free: return NULL; } -static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, - struct nfsd4_channel_attrs *req, - int numslots, int slotsize, - struct nfsd_net *nn) -{ - u32 maxrpc = nn->nfsd_serv->sv_max_mesg; - - new->maxreqs = numslots; - new->maxresp_cached = min_t(u32, req->maxresp_cached, - slotsize + NFSD_MIN_HDR_SEQ_SZ); - new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc); - new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc); - new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND); -} - static void free_conn(struct nfsd4_conn *c) 
{ svc_xprt_put(c->cn_xprt); @@ -793,8 +883,8 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u) list_del(&c->cn_persession); free_conn(c); } - spin_unlock(&clp->cl_lock); nfsd4_probe_callback(clp); + spin_unlock(&clp->cl_lock); } static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags) @@ -878,59 +968,20 @@ static void nfsd4_del_conns(struct nfsd4_session *s) static void __free_session(struct nfsd4_session *ses) { - nfsd4_put_drc_mem(slot_bytes(&ses->se_fchannel), ses->se_fchannel.maxreqs); free_session_slots(ses); kfree(ses); } -static void free_session(struct kref *kref) +static void free_session(struct nfsd4_session *ses) { - struct nfsd4_session *ses; - struct nfsd_net *nn; - - ses = container_of(kref, struct nfsd4_session, se_ref); - nn = net_generic(ses->se_client->net, nfsd_net_id); + struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id); lockdep_assert_held(&nn->client_lock); nfsd4_del_conns(ses); + nfsd4_put_drc_mem(&ses->se_fchannel); __free_session(ses); } -void nfsd4_put_session(struct nfsd4_session *ses) -{ - struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id); - - spin_lock(&nn->client_lock); - nfsd4_put_session_locked(ses); - spin_unlock(&nn->client_lock); -} - -static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan, - struct nfsd_net *nn) -{ - struct nfsd4_session *new; - int numslots, slotsize; - /* - * Note decreasing slot size below client's request may - * make it difficult for client to function correctly, whereas - * decreasing the number of slots will (just?) affect - * performance. When short on memory we therefore prefer to - * decrease number of slots instead of their size. 
- */ - slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached); - numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs); - if (numslots < 1) - return NULL; - - new = __alloc_session(slotsize, numslots); - if (!new) { - nfsd4_put_drc_mem(slotsize, numslots); - return NULL; - } - init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn); - return new; -} - static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) { int idx; @@ -945,7 +996,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru new->se_flags = cses->flags; new->se_cb_prog = cses->callback_prog; new->se_cb_sec = cses->cb_sec; - kref_init(&new->se_ref); + atomic_set(&new->se_ref, 0); idx = hash_sessionid(&new->se_sessionid); spin_lock(&nn->client_lock); list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]); @@ -953,7 +1004,8 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); spin_unlock(&nn->client_lock); - + memcpy(&new->se_fchannel, &cses->fore_channel, + sizeof(struct nfsd4_channel_attrs)); if (cses->flags & SESSION4_BACK_CHAN) { struct sockaddr *sa = svc_addr(rqstp); /* @@ -1000,38 +1052,6 @@ unhash_session(struct nfsd4_session *ses) spin_unlock(&ses->se_client->cl_lock); } -/* must be called under the client_lock */ -static inline void -renew_client_locked(struct nfs4_client *clp) -{ - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - - if (is_client_expired(clp)) { - WARN_ON(1); - printk("%s: client (clientid %08x/%08x) already expired\n", - __func__, - clp->cl_clientid.cl_boot, - clp->cl_clientid.cl_id); - return; - } - - dprintk("renewing client (clientid %08x/%08x)\n", - clp->cl_clientid.cl_boot, - clp->cl_clientid.cl_id); - list_move_tail(&clp->cl_lru, &nn->client_lru); - clp->cl_time = get_seconds(); -} - -static inline void -renew_client(struct 
nfs4_client *clp) -{ - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - - spin_lock(&nn->client_lock); - renew_client_locked(clp); - spin_unlock(&nn->client_lock); -} - /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ static int STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) @@ -1075,7 +1095,8 @@ free_client(struct nfs4_client *clp) ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, se_perclnt); list_del(&ses->se_perclnt); - nfsd4_put_session_locked(ses); + WARN_ON_ONCE(atomic_read(&ses->se_ref)); + free_session(ses); } free_svc_cred(&clp->cl_cred); kfree(clp->cl_name.data); @@ -1083,29 +1104,12 @@ free_client(struct nfs4_client *clp) kfree(clp); } -void -release_session_client(struct nfsd4_session *session) -{ - struct nfs4_client *clp = session->se_client; - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - - if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock)) - return; - if (is_client_expired(clp)) { - free_client(clp); - session->se_client = NULL; - } else - renew_client_locked(clp); - spin_unlock(&nn->client_lock); -} - /* must be called under the client_lock */ static inline void unhash_client_locked(struct nfs4_client *clp) { struct nfsd4_session *ses; - mark_client_expired(clp); list_del(&clp->cl_lru); spin_lock(&clp->cl_lock); list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) @@ -1131,7 +1135,7 @@ destroy_client(struct nfs4_client *clp) spin_unlock(&recall_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); - unhash_delegation(dp); + destroy_delegation(dp); } while (!list_empty(&clp->cl_openowners)) { oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); @@ -1147,8 +1151,8 @@ destroy_client(struct nfs4_client *clp) rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); spin_lock(&nn->client_lock); unhash_client_locked(clp); - if (atomic_read(&clp->cl_refcount) == 0) - free_client(clp); + 
WARN_ON_ONCE(atomic_read(&clp->cl_refcount)); + free_client(clp); spin_unlock(&nn->client_lock); } @@ -1327,6 +1331,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); INIT_LIST_HEAD(&clp->cl_callbacks); + INIT_LIST_HEAD(&clp->cl_revoked); spin_lock_init(&clp->cl_lock); nfsd4_init_callback(&clp->cl_cb_null); clp->cl_time = get_seconds(); @@ -1408,12 +1413,12 @@ move_to_confirmed(struct nfs4_client *clp) } static struct nfs4_client * -find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) +find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions) { struct nfs4_client *clp; unsigned int idhashval = clientid_hashval(clid->cl_id); - list_for_each_entry(clp, &nn->conf_id_hashtbl[idhashval], cl_idhash) { + list_for_each_entry(clp, &tbl[idhashval], cl_idhash) { if (same_clid(&clp->cl_clientid, clid)) { if ((bool)clp->cl_minorversion != sessions) return NULL; @@ -1425,19 +1430,19 @@ find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) } static struct nfs4_client * +find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) +{ + struct list_head *tbl = nn->conf_id_hashtbl; + + return find_client_in_id_table(tbl, clid, sessions); +} + +static struct nfs4_client * find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn) { - struct nfs4_client *clp; - unsigned int idhashval = clientid_hashval(clid->cl_id); + struct list_head *tbl = nn->unconf_id_hashtbl; - list_for_each_entry(clp, &nn->unconf_id_hashtbl[idhashval], cl_idhash) { - if (same_clid(&clp->cl_clientid, clid)) { - if ((bool)clp->cl_minorversion != sessions) - return NULL; - return clp; - } - } - return NULL; + return find_client_in_id_table(tbl, clid, sessions); } static bool clp_used_exchangeid(struct nfs4_client *clp) @@ -1641,6 +1646,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, default: /* checked by xdr code */ 
WARN_ON_ONCE(1); case SP4_SSV: + return nfserr_encr_alg_unsupp; case SP4_MACH_CRED: return nfserr_serverfault; /* no excuse :-/ */ } @@ -1782,10 +1788,55 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, /* seqid, slotID, slotID, slotID, status */ \ 5 ) * sizeof(__be32)) -static bool check_forechannel_attrs(struct nfsd4_channel_attrs fchannel) +static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn) { - return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ - || fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ; + u32 maxrpc = nn->nfsd_serv->sv_max_mesg; + + if (ca->maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ) + return nfserr_toosmall; + if (ca->maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ) + return nfserr_toosmall; + ca->headerpadsz = 0; + ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc); + ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc); + ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND); + ca->maxresp_cached = min_t(u32, ca->maxresp_cached, + NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ); + ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION); + /* + * Note decreasing slot size below client's request may make it + * difficult for client to function correctly, whereas + * decreasing the number of slots will (just?) affect + * performance. When short on memory we therefore prefer to + * decrease number of slots instead of their size. Clients that + * request larger slots than they need will get poor results: + */ + ca->maxreqs = nfsd4_get_drc_mem(ca); + if (!ca->maxreqs) + return nfserr_jukebox; + + return nfs_ok; +} + +static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) +{ + ca->headerpadsz = 0; + + /* + * These RPC_MAX_HEADER macros are overkill, especially since we + * don't even do gss on the backchannel yet. But this is still + * less than 1k. 
Tighten up this estimate in the unlikely event + * it turns out to be a problem for some client: + */ + if (ca->maxreq_sz < NFS4_enc_cb_recall_sz + RPC_MAX_HEADER_WITH_AUTH) + return nfserr_toosmall; + if (ca->maxresp_sz < NFS4_dec_cb_recall_sz + RPC_MAX_REPHEADER_WITH_AUTH) + return nfserr_toosmall; + ca->maxresp_cached = 0; + if (ca->maxops < 2) + return nfserr_toosmall; + + return nfs_ok; } __be32 @@ -1803,12 +1854,16 @@ nfsd4_create_session(struct svc_rqst *rqstp, if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) return nfserr_inval; - if (check_forechannel_attrs(cr_ses->fore_channel)) - return nfserr_toosmall; - new = alloc_session(&cr_ses->fore_channel, nn); - if (!new) - return nfserr_jukebox; + status = check_forechannel_attrs(&cr_ses->fore_channel, nn); + if (status) + return status; + status = check_backchannel_attrs(&cr_ses->back_channel); + if (status) + return status; status = nfserr_jukebox; + new = alloc_session(&cr_ses->fore_channel); + if (!new) + goto out_release_drc_mem; conn = alloc_conn_from_crses(rqstp, cr_ses); if (!conn) goto out_free_session; @@ -1816,6 +1871,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, nfs4_lock_state(); unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); conf = find_confirmed_client(&cr_ses->clientid, true, nn); + WARN_ON_ONCE(conf && unconf); if (conf) { cs_slot = &conf->cl_cs_slot; @@ -1842,8 +1898,12 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_conn; } old = find_confirmed_client_by_name(&unconf->cl_name, nn); - if (old) + if (old) { + status = mark_client_expired(old); + if (status) + goto out_free_conn; expire_client(old); + } move_to_confirmed(unconf); conf = unconf; } else { @@ -1862,23 +1922,21 @@ nfsd4_create_session(struct svc_rqst *rqstp, memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); - memcpy(&cr_ses->fore_channel, &new->se_fchannel, - sizeof(struct nfsd4_channel_attrs)); cs_slot->sl_seqid++; cr_ses->seqid = cs_slot->sl_seqid; /* cache solo and 
embedded create sessions under the state lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); nfs4_unlock_state(); -out: - dprintk("%s returns %d\n", __func__, ntohl(status)); return status; out_free_conn: nfs4_unlock_state(); free_conn(conn); out_free_session: __free_session(new); - goto out; +out_release_drc_mem: + nfsd4_put_drc_mem(&cr_ses->fore_channel); + return status; } static __be32 nfsd4_map_bcts_dir(u32 *dir) @@ -1916,30 +1974,30 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, { __be32 status; struct nfsd4_conn *conn; + struct nfsd4_session *session; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (!nfsd4_last_compound_op(rqstp)) return nfserr_not_only_op; + nfs4_lock_state(); spin_lock(&nn->client_lock); - cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp)); - /* Sorta weird: we only need the refcnt'ing because new_conn acquires - * client_lock iself: */ - if (cstate->session) { - nfsd4_get_session(cstate->session); - atomic_inc(&cstate->session->se_client->cl_refcount); - } + session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp)); spin_unlock(&nn->client_lock); - if (!cstate->session) - return nfserr_badsession; - + status = nfserr_badsession; + if (!session) + goto out; status = nfsd4_map_bcts_dir(&bcts->dir); if (status) - return status; + goto out; conn = alloc_conn(rqstp, bcts->dir); + status = nfserr_jukebox; if (!conn) - return nfserr_jukebox; - nfsd4_init_conn(rqstp, conn, cstate->session); - return nfs_ok; + goto out; + nfsd4_init_conn(rqstp, conn, session); + status = nfs_ok; +out: + nfs4_unlock_state(); + return status; } static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) @@ -1955,42 +2013,36 @@ nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_destroy_session *sessionid) { struct nfsd4_session *ses; - __be32 status = nfserr_badsession; + __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id); - 
/* Notes: - * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid - * - Should we return nfserr_back_chan_busy if waiting for - * callbacks on to-be-destroyed session? - * - Do we need to clear any callback info from previous session? - */ - + nfs4_lock_state(); + status = nfserr_not_only_op; if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { if (!nfsd4_last_compound_op(r)) - return nfserr_not_only_op; + goto out; } dump_sessionid(__func__, &sessionid->sessionid); spin_lock(&nn->client_lock); ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r)); - if (!ses) { - spin_unlock(&nn->client_lock); - goto out; - } - + status = nfserr_badsession; + if (!ses) + goto out_client_lock; + status = mark_session_dead_locked(ses); + if (status) + goto out_client_lock; unhash_session(ses); spin_unlock(&nn->client_lock); - nfs4_lock_state(); nfsd4_probe_callback_sync(ses->se_client); - nfs4_unlock_state(); spin_lock(&nn->client_lock); - nfsd4_del_conns(ses); - nfsd4_put_session_locked(ses); - spin_unlock(&nn->client_lock); + free_session(ses); status = nfs_ok; +out_client_lock: + spin_unlock(&nn->client_lock); out: - dprintk("%s returns %d\n", __func__, ntohl(status)); + nfs4_unlock_state(); return status; } @@ -2050,6 +2102,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, { struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfsd4_session *session; + struct nfs4_client *clp; struct nfsd4_slot *slot; struct nfsd4_conn *conn; __be32 status; @@ -2070,19 +2123,26 @@ nfsd4_sequence(struct svc_rqst *rqstp, status = nfserr_badsession; session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp)); if (!session) - goto out; + goto out_no_session; + clp = session->se_client; + status = get_client_locked(clp); + if (status) + goto out_no_session; + status = nfsd4_get_session_locked(session); + if (status) + goto out_put_client; status = nfserr_too_many_ops; if (nfsd4_session_too_many_ops(rqstp, session)) - goto out; + goto 
out_put_session; status = nfserr_req_too_big; if (nfsd4_request_too_big(rqstp, session)) - goto out; + goto out_put_session; status = nfserr_badslot; if (seq->slotid >= session->se_fchannel.maxreqs) - goto out; + goto out_put_session; slot = session->se_slots[seq->slotid]; dprintk("%s: slotid %d\n", __func__, seq->slotid); @@ -2097,7 +2157,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (status == nfserr_replay_cache) { status = nfserr_seq_misordered; if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED)) - goto out; + goto out_put_session; cstate->slot = slot; cstate->session = session; /* Return the cached reply status and set cstate->status @@ -2107,7 +2167,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, goto out; } if (status) - goto out; + goto out_put_session; nfsd4_sequence_check_conn(conn, session); conn = NULL; @@ -2124,27 +2184,27 @@ nfsd4_sequence(struct svc_rqst *rqstp, cstate->session = session; out: - /* Hold a session reference until done processing the compound. */ - if (cstate->session) { - struct nfs4_client *clp = session->se_client; - - nfsd4_get_session(cstate->session); - atomic_inc(&clp->cl_refcount); - switch (clp->cl_cb_state) { - case NFSD4_CB_DOWN: - seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; - break; - case NFSD4_CB_FAULT: - seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; - break; - default: - seq->status_flags = 0; - } + switch (clp->cl_cb_state) { + case NFSD4_CB_DOWN: + seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; + break; + case NFSD4_CB_FAULT: + seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; + break; + default: + seq->status_flags = 0; } + if (!list_empty(&clp->cl_revoked)) + seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; +out_no_session: kfree(conn); spin_unlock(&nn->client_lock); - dprintk("%s: return %d\n", __func__, ntohl(status)); return status; +out_put_session: + nfsd4_put_session(session); +out_put_client: + put_client_renew_locked(clp); + goto out_no_session; } __be32 @@ -2157,17 +2217,12 @@ 
nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta nfs4_lock_state(); unconf = find_unconfirmed_client(&dc->clientid, true, nn); conf = find_confirmed_client(&dc->clientid, true, nn); + WARN_ON_ONCE(conf && unconf); if (conf) { clp = conf; - if (!is_client_expired(conf) && client_has_state(conf)) { - status = nfserr_clientid_busy; - goto out; - } - - /* rfc5661 18.50.3 */ - if (cstate->session && conf == cstate->session->se_client) { + if (client_has_state(conf)) { status = nfserr_clientid_busy; goto out; } @@ -2181,7 +2236,6 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta expire_client(clp); out: nfs4_unlock_state(); - dprintk("%s return %d\n", __func__, ntohl(status)); return status; } @@ -2319,8 +2373,12 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, expire_client(unconf); } else { /* case 3: normal case; new or rebooted client */ conf = find_confirmed_client_by_name(&unconf->cl_name, nn); - if (conf) + if (conf) { + status = mark_client_expired(conf); + if (status) + goto out; expire_client(conf); + } move_to_confirmed(unconf); nfsd4_probe_callback(unconf); } @@ -2340,7 +2398,6 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) unsigned int hashval = file_hashval(ino); atomic_set(&fp->fi_ref, 1); - INIT_LIST_HEAD(&fp->fi_hash); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); fp->fi_inode = igrab(ino); @@ -2349,7 +2406,7 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); spin_lock(&recall_lock); - list_add(&fp->fi_hash, &file_hashtbl[hashval]); + hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]); spin_unlock(&recall_lock); } @@ -2535,7 +2592,7 @@ find_file(struct inode *ino) struct nfs4_file *fp; spin_lock(&recall_lock); - list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { + hlist_for_each_entry(fp, &file_hashtbl[hashval], 
fi_hash) { if (fp->fi_inode == ino) { get_nfs4_file(fp); spin_unlock(&recall_lock); @@ -2558,8 +2615,6 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) struct nfs4_ol_stateid *stp; __be32 ret; - dprintk("NFSD: nfs4_share_conflict\n"); - fp = find_file(ino); if (!fp) return nfs_ok; @@ -2578,6 +2633,9 @@ out: static void nfsd_break_one_deleg(struct nfs4_delegation *dp) { + struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + /* We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease * callback (and since the lease code is serialized by the kernel @@ -2585,7 +2643,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) * it's safe to take a reference: */ atomic_inc(&dp->dl_count); - list_add_tail(&dp->dl_recall_lru, &del_recall_lru); + list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); /* only place dl_time is set. protected by lock_flocks*/ dp->dl_time = get_seconds(); @@ -2731,7 +2789,7 @@ static bool nfsd4_is_deleg_cur(struct nfsd4_open *open) } static __be32 -nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open, +nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, struct nfs4_delegation **dp) { int flags; @@ -3056,7 +3114,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (fp) { if ((status = nfs4_check_open(fp, open, &stp))) goto out; - status = nfs4_check_deleg(cl, fp, open, &dp); + status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; } else { @@ -3234,13 +3292,12 @@ nfs4_laundromat(struct nfsd_net *nn) clientid_val = t; break; } - if (atomic_read(&clp->cl_refcount)) { + if (mark_client_expired_locked(clp)) { dprintk("NFSD: client in use (clientid %08x)\n", clp->cl_clientid.cl_id); continue; } - unhash_client_locked(clp); - list_add(&clp->cl_lru, &reaplist); + list_move(&clp->cl_lru, &reaplist); } 
spin_unlock(&nn->client_lock); list_for_each_safe(pos, next, &reaplist) { @@ -3250,7 +3307,7 @@ nfs4_laundromat(struct nfsd_net *nn) expire_client(clp); } spin_lock(&recall_lock); - list_for_each_safe(pos, next, &del_recall_lru) { + list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn) continue; @@ -3265,7 +3322,7 @@ nfs4_laundromat(struct nfsd_net *nn) spin_unlock(&recall_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - unhash_delegation(dp); + revoke_delegation(dp); } test_val = nn->nfsd4_lease; list_for_each_safe(pos, next, &nn->close_lru) { @@ -3308,16 +3365,6 @@ static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *s return nfs_ok; } -static int -STALE_STATEID(stateid_t *stateid, struct nfsd_net *nn) -{ - if (stateid->si_opaque.so_clid.cl_boot == nn->boot_time) - return 0; - dprintk("NFSD: stale stateid " STATEID_FMT "!\n", - STATEID_VAL(stateid)); - return 1; -} - static inline int access_permit_read(struct nfs4_ol_stateid *stp) { @@ -3434,13 +3481,24 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) status = check_stateid_generation(stateid, &s->sc_stateid, 1); if (status) return status; - if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID))) + switch (s->sc_type) { + case NFS4_DELEG_STID: + return nfs_ok; + case NFS4_REVOKED_DELEG_STID: + return nfserr_deleg_revoked; + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + ols = openlockstateid(s); + if (ols->st_stateowner->so_is_open_owner + && !(openowner(ols->st_stateowner)->oo_flags + & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; return nfs_ok; - ols = openlockstateid(s); - if (ols->st_stateowner->so_is_open_owner - && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + default: + printk("unknown stateid type %x\n", s->sc_type); + case 
NFS4_CLOSED_STID: return nfserr_bad_stateid; - return nfs_ok; + } } static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, @@ -3448,19 +3506,20 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfsd_net *nn) { struct nfs4_client *cl; + __be32 status; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return nfserr_bad_stateid; - if (STALE_STATEID(stateid, nn)) + status = lookup_clientid(&stateid->si_opaque.so_clid, sessions, + nn, &cl); + if (status == nfserr_stale_clientid) return nfserr_stale_stateid; - cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions, nn); - if (!cl) - return nfserr_expired; + if (status) + return status; *s = find_stateid_by_type(cl, stateid, typemask); if (!*s) return nfserr_bad_stateid; return nfs_ok; - } /* @@ -3570,6 +3629,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; + struct nfs4_delegation *dp; struct nfs4_client *cl = cstate->session->se_client; __be32 ret = nfserr_bad_stateid; @@ -3591,6 +3651,11 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, else ret = nfserr_locks_held; break; + case NFS4_REVOKED_DELEG_STID: + dp = delegstateid(s); + destroy_revoked_delegation(dp); + ret = nfs_ok; + break; default: ret = nfserr_bad_stateid; } @@ -3615,10 +3680,12 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; - if (stp->st_stid.sc_type == NFS4_CLOSED_STID) + if (stp->st_stid.sc_type == NFS4_CLOSED_STID + || stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID) /* * "Closed" stateid's exist *only* to return - * nfserr_replay_me from the previous step. + * nfserr_replay_me from the previous step, and + * revoked delegations are kept only for free_stateid. 
*/ return nfserr_bad_stateid; status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); @@ -3648,7 +3715,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, if (status) return status; *stpp = openlockstateid(s); - cstate->replay_owner = (*stpp)->st_stateowner; + if (!nfsd4_has_session(cstate)) + cstate->replay_owner = (*stpp)->st_stateowner; return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); } @@ -3706,6 +3774,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_client_record_create(oo->oo_owner.so_client); status = nfs_ok; out: + nfsd4_bump_seqid(cstate, status); if (!cstate->replay_owner) nfs4_unlock_state(); return status; @@ -3789,31 +3858,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); status = nfs_ok; out: + nfsd4_bump_seqid(cstate, status); if (!cstate->replay_owner) nfs4_unlock_state(); return status; } -void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so) -{ - struct nfs4_openowner *oo; - struct nfs4_ol_stateid *s; - - if (!so->so_is_open_owner) - return; - oo = openowner(so); - s = oo->oo_last_closed_stid; - if (!s) - return; - if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) { - /* Release the last_closed_stid on the next seqid bump: */ - oo->oo_flags |= NFS4_OO_PURGE_CLOSE; - return; - } - oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE; - release_last_closed_stateid(oo); -} - static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) { unhash_open_stateid(s); @@ -3842,28 +3892,30 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &close->cl_stateid, NFS4_OPEN_STID|NFS4_CLOSED_STID, &stp, nn); + nfsd4_bump_seqid(cstate, status); if (status) goto out; oo = openowner(stp->st_stateowner); - status = nfs_ok; update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); nfsd4_close_open_stateid(stp); - 
release_last_closed_stateid(oo); - oo->oo_last_closed_stid = stp; + + if (cstate->minorversion) { + unhash_stid(&stp->st_stid); + free_generic_stateid(stp); + } else + oo->oo_last_closed_stid = stp; if (list_empty(&oo->oo_owner.so_stateids)) { - if (cstate->minorversion) { + if (cstate->minorversion) release_openowner(oo); - cstate->replay_owner = NULL; - } else { + else { /* * In the 4.0 case we need to keep the owners around a * little while to handle CLOSE replay. */ - if (list_empty(&oo->oo_owner.so_stateids)) - move_to_close_lru(oo, SVC_NET(rqstp)); + move_to_close_lru(oo, SVC_NET(rqstp)); } } out: @@ -3895,7 +3947,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - unhash_delegation(dp); + destroy_delegation(dp); out: nfs4_unlock_state(); @@ -4273,6 +4325,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: if (status && new_state) release_lockowner(lock_sop); + nfsd4_bump_seqid(cstate, status); if (!cstate->replay_owner) nfs4_unlock_state(); if (file_lock) @@ -4382,6 +4435,7 @@ __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_locku *locku) { + struct nfs4_lockowner *lo; struct nfs4_ol_stateid *stp; struct file *filp = NULL; struct file_lock *file_lock = NULL; @@ -4414,9 +4468,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_jukebox; goto out; } + lo = lockowner(stp->st_stateowner); locks_init_lock(file_lock); file_lock->fl_type = F_UNLCK; - file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner); + file_lock->fl_owner = (fl_owner_t)lo; file_lock->fl_pid = current->tgid; file_lock->fl_file = filp; file_lock->fl_flags = FL_POSIX; @@ -4427,21 +4482,21 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, locku->lu_length); nfs4_transform_lock_offset(file_lock); - /* - * Try to unlock the file in the VFS. 
- */ err = vfs_lock_file(filp, F_SETLK, file_lock, NULL); if (err) { dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); goto out_nfserr; } - /* - * OK, unlock succeeded; the only thing left to do is update the stateid. - */ update_stateid(&stp->st_stid.sc_stateid); memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + if (nfsd4_has_session(cstate) && !check_for_locks(stp->st_file, lo)) { + WARN_ON_ONCE(cstate->replay_owner); + release_lockowner(lo); + } + out: + nfsd4_bump_seqid(cstate, status); if (!cstate->replay_owner) nfs4_unlock_state(); if (file_lock) @@ -4634,6 +4689,8 @@ nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn) u64 nfsd_forget_client(struct nfs4_client *clp, u64 max) { + if (mark_client_expired(clp)) + return 0; expire_client(clp); return 1; } @@ -4740,7 +4797,7 @@ u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max) spin_unlock(&recall_lock); list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) - unhash_delegation(dp); + revoke_delegation(dp); return count; } @@ -4812,12 +4869,6 @@ struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_ void nfs4_state_init(void) { - int i; - - for (i = 0; i < FILE_HASH_SIZE; i++) { - INIT_LIST_HEAD(&file_hashtbl[i]); - } - INIT_LIST_HEAD(&del_recall_lru); } /* @@ -4881,6 +4932,7 @@ static int nfs4_state_create_net(struct net *net) nn->unconf_name_tree = RB_ROOT; INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); + INIT_LIST_HEAD(&nn->del_recall_lru); spin_lock_init(&nn->client_lock); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); @@ -4993,16 +5045,14 @@ nfs4_state_shutdown_net(struct net *net) INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); - list_for_each_safe(pos, next, &del_recall_lru) { + list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - if (dp->dl_stid.sc_client->net != net) - continue; list_move(&dp->dl_recall_lru, 
&reaplist); } spin_unlock(&recall_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - unhash_delegation(dp); + destroy_delegation(dp); } nfsd4_client_tracking_exit(net); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 01168865dd37..6cd86e0fe450 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -264,7 +264,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, iattr->ia_valid |= ATTR_SIZE; } if (bmval[0] & FATTR4_WORD0_ACL) { - int nace; + u32 nace; struct nfs4_ace *ace; READ_BUF(4); len += 4; @@ -344,10 +344,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, all 32 bits of 'nseconds'. */ READ_BUF(12); len += 12; - READ32(dummy32); - if (dummy32) - return nfserr_inval; - READ32(iattr->ia_atime.tv_sec); + READ64(iattr->ia_atime.tv_sec); READ32(iattr->ia_atime.tv_nsec); if (iattr->ia_atime.tv_nsec >= (u32)1000000000) return nfserr_inval; @@ -370,10 +367,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, all 32 bits of 'nseconds'. */ READ_BUF(12); len += 12; - READ32(dummy32); - if (dummy32) - return nfserr_inval; - READ32(iattr->ia_mtime.tv_sec); + READ64(iattr->ia_mtime.tv_sec); READ32(iattr->ia_mtime.tv_nsec); if (iattr->ia_mtime.tv_nsec >= (u32)1000000000) return nfserr_inval; @@ -804,6 +798,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) open->op_iattr.ia_valid = 0; open->op_openowner = NULL; + open->op_xdr_error = 0; /* seqid, share_access, share_deny, clientid, ownerlen */ READ_BUF(4); READ32(open->op_seqid); @@ -1692,36 +1687,6 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) } while (0) #define ADJUST_ARGS() resp->p = p -/* - * Header routine to setup seqid operation replay cache - */ -#define ENCODE_SEQID_OP_HEAD \ - __be32 *save; \ - \ - save = resp->p; - -/* - * Routine for encoding the result of a "seqid-mutating" NFSv4 operation. 
This - * is where sequence id's are incremented, and the replay cache is filled. - * Note that we increment sequence id's here, at the last moment, so we're sure - * we know whether the error to be returned is a sequence id mutating error. - */ - -static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr) -{ - struct nfs4_stateowner *stateowner = resp->cstate.replay_owner; - - if (seqid_mutating_err(ntohl(nfserr)) && stateowner) { - stateowner->so_seqid++; - stateowner->so_replay.rp_status = nfserr; - stateowner->so_replay.rp_buflen = - (char *)resp->p - (char *)save; - memcpy(stateowner->so_replay.rp_buf, save, - stateowner->so_replay.rp_buflen); - nfsd4_purge_closed_stateid(stateowner); - } -} - /* Encode as an array of strings the string given with components * separated @sep, escaped with esc_enter and esc_exit. */ @@ -2401,8 +2366,7 @@ out_acl: if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { if ((buflen -= 12) < 0) goto out_resource; - WRITE32(0); - WRITE32(stat.atime.tv_sec); + WRITE64((s64)stat.atime.tv_sec); WRITE32(stat.atime.tv_nsec); } if (bmval1 & FATTR4_WORD1_TIME_DELTA) { @@ -2415,15 +2379,13 @@ out_acl: if (bmval1 & FATTR4_WORD1_TIME_METADATA) { if ((buflen -= 12) < 0) goto out_resource; - WRITE32(0); - WRITE32(stat.ctime.tv_sec); + WRITE64((s64)stat.ctime.tv_sec); WRITE32(stat.ctime.tv_nsec); } if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { if ((buflen -= 12) < 0) goto out_resource; - WRITE32(0); - WRITE32(stat.mtime.tv_sec); + WRITE64((s64)stat.mtime.tv_sec); WRITE32(stat.mtime.tv_nsec); } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { @@ -2661,12 +2623,9 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) { - ENCODE_SEQID_OP_HEAD; - if (!nfserr) nfsd4_encode_stateid(resp, &close->cl_stateid); - encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2762,14 +2721,11 @@ 
nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie static __be32 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) { - ENCODE_SEQID_OP_HEAD; - if (!nfserr) nfsd4_encode_stateid(resp, &lock->lk_resp_stateid); else if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lock->lk_denied); - encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2784,12 +2740,9 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l static __be32 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) { - ENCODE_SEQID_OP_HEAD; - if (!nfserr) nfsd4_encode_stateid(resp, &locku->lu_stateid); - encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -2812,7 +2765,6 @@ static __be32 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { __be32 *p; - ENCODE_SEQID_OP_HEAD; if (nfserr) goto out; @@ -2884,31 +2836,24 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op } /* XXX save filehandle here */ out: - encode_seqid_op_tail(resp, save, nfserr); return nfserr; } static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { - ENCODE_SEQID_OP_HEAD; - if (!nfserr) nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); - encode_seqid_op_tail(resp, save, nfserr); return nfserr; } static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { - ENCODE_SEQID_OP_HEAD; - if (!nfserr) nfsd4_encode_stateid(resp, &od->od_stateid); - encode_seqid_op_tail(resp, save, nfserr); return nfserr; } @@ -3138,13 +3083,13 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, - __be32 nfserr,struct svc_export *exp) + __be32 nfserr, struct svc_export *exp) { - int i = 0; - u32 nflavs; + 
u32 i, nflavs, supported; struct exp_flavor_info *flavs; struct exp_flavor_info def_flavs[2]; - __be32 *p; + __be32 *p, *flavorsp; + static bool report = true; if (nfserr) goto out; @@ -3168,34 +3113,40 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, } } + supported = 0; RESERVE_SPACE(4); - WRITE32(nflavs); + flavorsp = p++; /* to be backfilled later */ ADJUST_ARGS(); + for (i = 0; i < nflavs; i++) { - u32 flav = flavs[i].pseudoflavor; - struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav); + rpc_authflavor_t pf = flavs[i].pseudoflavor; + struct rpcsec_gss_info info; - if (gm) { - RESERVE_SPACE(4); + if (rpcauth_get_gssinfo(pf, &info) == 0) { + supported++; + RESERVE_SPACE(4 + 4 + info.oid.len + 4 + 4); WRITE32(RPC_AUTH_GSS); + WRITE32(info.oid.len); + WRITEMEM(info.oid.data, info.oid.len); + WRITE32(info.qop); + WRITE32(info.service); ADJUST_ARGS(); - RESERVE_SPACE(4 + gm->gm_oid.len); - WRITE32(gm->gm_oid.len); - WRITEMEM(gm->gm_oid.data, gm->gm_oid.len); - ADJUST_ARGS(); - RESERVE_SPACE(4); - WRITE32(0); /* qop */ - ADJUST_ARGS(); + } else if (pf < RPC_AUTH_MAXFLAVOR) { + supported++; RESERVE_SPACE(4); - WRITE32(gss_pseudoflavor_to_service(gm, flav)); + WRITE32(pf); ADJUST_ARGS(); - gss_mech_put(gm); } else { - RESERVE_SPACE(4); - WRITE32(flav); - ADJUST_ARGS(); + if (report) + pr_warn("NFS: SECINFO: security flavor %u " + "is not supported\n", pf); } } + + if (nflavs != supported) + report = false; + *flavorsp = htonl(supported); + out: if (exp) exp_put(exp); @@ -3566,6 +3517,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { + struct nfs4_stateowner *so = resp->cstate.replay_owner; __be32 *statp; __be32 *p; @@ -3582,6 +3534,11 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) /* nfsd4_check_drc_limit guarantees enough room for error status */ if (!op->status) op->status = nfsd4_check_resp_size(resp, 0); + if 
(so) { + so->so_replay.rp_status = op->status; + so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1); + memcpy(so->so_replay.rp_buf, statp+1, so->so_replay.rp_buflen); + } status: /* * Note: We write the status directly, instead of using WRITE32(), @@ -3683,7 +3640,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE; } /* Renew the clientid on success and on replay */ - release_session_client(cs->session); + put_client_renew(cs->session->se_client); nfsd4_put_session(cs->session); } return 1; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 62c1ee128aeb..e76244edd748 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -11,6 +11,8 @@ #include <linux/slab.h> #include <linux/sunrpc/addr.h> #include <linux/highmem.h> +#include <linux/log2.h> +#include <linux/hash.h> #include <net/checksum.h> #include "nfsd.h" @@ -18,30 +20,49 @@ #define NFSDDBG_FACILITY NFSDDBG_REPCACHE -#define HASHSIZE 64 +/* + * We use this value to determine the number of hash buckets from the max + * cache size, the idea being that when the cache is at its maximum number + * of entries, then this should be the average number of entries per bucket. + */ +#define TARGET_BUCKET_SIZE 64 static struct hlist_head * cache_hash; static struct list_head lru_head; static struct kmem_cache *drc_slab; -static unsigned int num_drc_entries; + +/* max number of entries allowed in the cache */ static unsigned int max_drc_entries; +/* number of significant bits in the hash value */ +static unsigned int maskbits; + /* - * Calculate the hash index from an XID. + * Stats and other tracking of on the duplicate reply cache. 
All of these and + * the "rc" fields in nfsdstats are protected by the cache_lock */ -static inline u32 request_hash(u32 xid) -{ - u32 h = xid; - h ^= (xid >> 24); - return h & (HASHSIZE-1); -} + +/* total number of entries */ +static unsigned int num_drc_entries; + +/* cache misses due only to checksum comparison failures */ +static unsigned int payload_misses; + +/* amount of memory (in bytes) currently consumed by the DRC */ +static unsigned int drc_mem_usage; + +/* longest hash chain seen */ +static unsigned int longest_chain; + +/* size of cache when we saw the longest hash chain */ +static unsigned int longest_chain_cachesize; static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); static void cache_cleaner_func(struct work_struct *unused); static int nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc); -struct shrinker nfsd_reply_cache_shrinker = { +static struct shrinker nfsd_reply_cache_shrinker = { .shrink = nfsd_reply_cache_shrink, .seeks = 1, }; @@ -82,6 +103,16 @@ nfsd_cache_size_limit(void) return min_t(unsigned int, limit, 256*1024); } +/* + * Compute the number of hash buckets we need. Divide the max cachesize by + * the "target" max bucket size, and round up to next power of two. 
+ */ +static unsigned int +nfsd_hashsize(unsigned int limit) +{ + return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); +} + static struct svc_cacherep * nfsd_reply_cache_alloc(void) { @@ -100,11 +131,15 @@ nfsd_reply_cache_alloc(void) static void nfsd_reply_cache_free_locked(struct svc_cacherep *rp) { - if (rp->c_type == RC_REPLBUFF) + if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { + drc_mem_usage -= rp->c_replvec.iov_len; kfree(rp->c_replvec.iov_base); - hlist_del(&rp->c_hash); + } + if (!hlist_unhashed(&rp->c_hash)) + hlist_del(&rp->c_hash); list_del(&rp->c_lru); --num_drc_entries; + drc_mem_usage -= sizeof(*rp); kmem_cache_free(drc_slab, rp); } @@ -118,20 +153,24 @@ nfsd_reply_cache_free(struct svc_cacherep *rp) int nfsd_reply_cache_init(void) { + unsigned int hashsize; + + INIT_LIST_HEAD(&lru_head); + max_drc_entries = nfsd_cache_size_limit(); + num_drc_entries = 0; + hashsize = nfsd_hashsize(max_drc_entries); + maskbits = ilog2(hashsize); + register_shrinker(&nfsd_reply_cache_shrinker); drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), 0, 0, NULL); if (!drc_slab) goto out_nomem; - cache_hash = kcalloc(HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); + cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL); if (!cache_hash) goto out_nomem; - INIT_LIST_HEAD(&lru_head); - max_drc_entries = nfsd_cache_size_limit(); - num_drc_entries = 0; - return 0; out_nomem: printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); @@ -179,7 +218,7 @@ static void hash_refile(struct svc_cacherep *rp) { hlist_del_init(&rp->c_hash); - hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid)); + hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits)); } static inline bool @@ -272,6 +311,26 @@ nfsd_cache_csum(struct svc_rqst *rqstp) return csum; } +static bool +nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp) +{ + /* Check RPC header info first */ + if (rqstp->rq_xid != rp->c_xid 
|| rqstp->rq_proc != rp->c_proc || + rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers || + rqstp->rq_arg.len != rp->c_len || + !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) || + rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr)) + return false; + + /* compare checksum of NFS data */ + if (csum != rp->c_csum) { + ++payload_misses; + return false; + } + + return true; +} + /* * Search the request hash for an entry that matches the given rqstp. * Must be called with cache_lock held. Returns the found entry or @@ -280,23 +339,30 @@ nfsd_cache_csum(struct svc_rqst *rqstp) static struct svc_cacherep * nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum) { - struct svc_cacherep *rp; + struct svc_cacherep *rp, *ret = NULL; struct hlist_head *rh; - __be32 xid = rqstp->rq_xid; - u32 proto = rqstp->rq_prot, - vers = rqstp->rq_vers, - proc = rqstp->rq_proc; + unsigned int entries = 0; - rh = &cache_hash[request_hash(xid)]; + rh = &cache_hash[hash_32(rqstp->rq_xid, maskbits)]; hlist_for_each_entry(rp, rh, c_hash) { - if (xid == rp->c_xid && proc == rp->c_proc && - proto == rp->c_prot && vers == rp->c_vers && - rqstp->rq_arg.len == rp->c_len && csum == rp->c_csum && - rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) && - rpc_get_port(svc_addr(rqstp)) == rpc_get_port((struct sockaddr *)&rp->c_addr)) - return rp; + ++entries; + if (nfsd_cache_match(rqstp, csum, rp)) { + ret = rp; + break; + } } - return NULL; + + /* tally hash chain length stats */ + if (entries > longest_chain) { + longest_chain = entries; + longest_chain_cachesize = num_drc_entries; + } else if (entries == longest_chain) { + /* prefer to keep the smallest cachesize possible here */ + longest_chain_cachesize = min(longest_chain_cachesize, + num_drc_entries); + } + + return ret; } /* @@ -317,55 +383,55 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) __wsum csum; unsigned long age; int type = rqstp->rq_cachetype; - int rtn; + int rtn = RC_DOIT; 
rqstp->rq_cacherep = NULL; if (type == RC_NOCACHE) { nfsdstats.rcnocache++; - return RC_DOIT; + return rtn; } csum = nfsd_cache_csum(rqstp); + /* + * Since the common case is a cache miss followed by an insert, + * preallocate an entry. First, try to reuse the first entry on the LRU + * if it works, then go ahead and prune the LRU list. + */ spin_lock(&cache_lock); - rtn = RC_DOIT; - - rp = nfsd_cache_search(rqstp, csum); - if (rp) - goto found_entry; - - /* Try to use the first entry on the LRU */ if (!list_empty(&lru_head)) { rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru); if (nfsd_cache_entry_expired(rp) || num_drc_entries >= max_drc_entries) { lru_put_end(rp); prune_cache_entries(); - goto setup_entry; + goto search_cache; } } - /* Drop the lock and allocate a new entry */ + /* No expired ones available, allocate a new one. */ spin_unlock(&cache_lock); rp = nfsd_reply_cache_alloc(); - if (!rp) { - dprintk("nfsd: unable to allocate DRC entry!\n"); - return RC_DOIT; - } spin_lock(&cache_lock); - ++num_drc_entries; + if (likely(rp)) { + ++num_drc_entries; + drc_mem_usage += sizeof(*rp); + } - /* - * Must search again just in case someone inserted one - * after we dropped the lock above. - */ +search_cache: found = nfsd_cache_search(rqstp, csum); if (found) { - nfsd_reply_cache_free_locked(rp); + if (likely(rp)) + nfsd_reply_cache_free_locked(rp); rp = found; goto found_entry; } + if (!rp) { + dprintk("nfsd: unable to allocate DRC entry!\n"); + goto out; + } + /* * We're keeping the one we just allocated. Are we now over the * limit? 
Prune one off the tip of the LRU in trade for the one we @@ -375,7 +441,6 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) nfsd_reply_cache_free_locked(list_first_entry(&lru_head, struct svc_cacherep, c_lru)); -setup_entry: nfsdstats.rcmisses++; rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; @@ -393,6 +458,7 @@ setup_entry: /* release any buffer */ if (rp->c_type == RC_REPLBUFF) { + drc_mem_usage -= rp->c_replvec.iov_len; kfree(rp->c_replvec.iov_base); rp->c_replvec.iov_base = NULL; } @@ -461,6 +527,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) struct svc_cacherep *rp = rqstp->rq_cacherep; struct kvec *resv = &rqstp->rq_res.head[0], *cachv; int len; + size_t bufsize = 0; if (!rp) return; @@ -482,19 +549,21 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) break; case RC_REPLBUFF: cachv = &rp->c_replvec; - cachv->iov_base = kmalloc(len << 2, GFP_KERNEL); + bufsize = len << 2; + cachv->iov_base = kmalloc(bufsize, GFP_KERNEL); if (!cachv->iov_base) { nfsd_reply_cache_free(rp); return; } - cachv->iov_len = len << 2; - memcpy(cachv->iov_base, statp, len << 2); + cachv->iov_len = bufsize; + memcpy(cachv->iov_base, statp, bufsize); break; case RC_NOCACHE: nfsd_reply_cache_free(rp); return; } spin_lock(&cache_lock); + drc_mem_usage += bufsize; lru_put_end(rp); rp->c_secure = rqstp->rq_secure; rp->c_type = cachetype; @@ -522,3 +591,30 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) vec->iov_len += data->iov_len; return 1; } + +/* + * Note that fields may be added, removed or reordered in the future. Programs + * scraping this file for info should test the labels to ensure they're + * getting the correct field. 
+ */ +static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) +{ + spin_lock(&cache_lock); + seq_printf(m, "max entries: %u\n", max_drc_entries); + seq_printf(m, "num entries: %u\n", num_drc_entries); + seq_printf(m, "hash buckets: %u\n", 1 << maskbits); + seq_printf(m, "mem usage: %u\n", drc_mem_usage); + seq_printf(m, "cache hits: %u\n", nfsdstats.rchits); + seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses); + seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache); + seq_printf(m, "payload misses: %u\n", payload_misses); + seq_printf(m, "longest chain len: %u\n", longest_chain); + seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize); + spin_unlock(&cache_lock); + return 0; +} + +int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, nfsd_reply_cache_stats_show, NULL); +} diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 13a21c8fca49..7f555179bf81 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -35,6 +35,7 @@ enum { NFSD_Threads, NFSD_Pool_Threads, NFSD_Pool_Stats, + NFSD_Reply_Cache_Stats, NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, @@ -177,7 +178,7 @@ static int export_features_open(struct inode *inode, struct file *file) return single_open(file, export_features_show, NULL); } -static struct file_operations export_features_operations = { +static const struct file_operations export_features_operations = { .open = export_features_open, .read = seq_read, .llseek = seq_lseek, @@ -196,7 +197,7 @@ static int supported_enctypes_open(struct inode *inode, struct file *file) return single_open(file, supported_enctypes_show, NULL); } -static struct file_operations supported_enctypes_ops = { +static const struct file_operations supported_enctypes_ops = { .open = supported_enctypes_open, .read = seq_read, .llseek = seq_lseek, @@ -212,6 +213,13 @@ static const struct file_operations pool_stats_operations = { .owner = THIS_MODULE, }; +static struct file_operations 
reply_cache_stats_operations = { + .open = nfsd_reply_cache_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /*----------------------------------------------------------------------------*/ /* * payload - write methods @@ -1047,6 +1055,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, + [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, @@ -1090,6 +1099,7 @@ static struct file_system_type nfsd_fs_type = { .mount = nfsd_mount, .kill_sb = nfsd_umount, }; +MODULE_ALIAS_FS("nfsd"); #ifdef CONFIG_PROC_FS static int create_proc_exports_entry(void) @@ -1101,8 +1111,10 @@ static int create_proc_exports_entry(void) return -ENOMEM; entry = proc_create("exports", 0, entry, &exports_proc_operations); - if (!entry) + if (!entry) { + remove_proc_entry("fs/nfs", NULL); return -ENOMEM; + } return 0; } #else /* CONFIG_PROC_FS */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 1a8c7391f7ae..274e2a114e05 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -79,6 +79,8 @@ struct nfs4_stid { #define NFS4_DELEG_STID 4 /* For an open stateid kept around *only* to process close replays: */ #define NFS4_CLOSED_STID 8 +/* For a deleg stateid kept around only to process free_stateid's: */ +#define NFS4_REVOKED_DELEG_STID 16 unsigned char sc_type; stateid_t sc_stateid; struct nfs4_client *sc_client; @@ -194,9 +196,11 @@ struct nfsd4_conn { }; struct nfsd4_session { - struct kref se_ref; + atomic_t se_ref; struct list_head se_hash; /* hash by sessionid */ struct 
list_head se_perclnt; +/* See SESSION4_PERSIST, etc. for standard flags; this is internal-only: */ +#define NFS4_SESSION_DEAD 0x010 u32 se_flags; struct nfs4_client *se_client; struct nfs4_sessionid se_sessionid; @@ -236,6 +240,7 @@ struct nfs4_client { struct list_head cl_openowners; struct idr cl_stateids; /* stateid lookup */ struct list_head cl_delegations; + struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ struct list_head cl_lru; /* tail queue */ struct xdr_netobj cl_name; /* id generated by client */ nfs4_verifier cl_verifier; /* generated by client */ @@ -286,18 +291,6 @@ struct nfs4_client { struct net *net; }; -static inline void -mark_client_expired(struct nfs4_client *clp) -{ - clp->cl_time = 0; -} - -static inline bool -is_client_expired(struct nfs4_client *clp) -{ - return clp->cl_time == 0; -} - /* struct nfs4_client_reset * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl * upon lease reset, or from upcall to state_daemon (to read in state @@ -365,7 +358,6 @@ struct nfs4_openowner { struct nfs4_ol_stateid *oo_last_closed_stid; time_t oo_time; /* time of placement on so_close_lru */ #define NFS4_OO_CONFIRMED 1 -#define NFS4_OO_PURGE_CLOSE 2 #define NFS4_OO_NEW 4 unsigned char oo_flags; }; @@ -373,7 +365,7 @@ struct nfs4_openowner { struct nfs4_lockowner { struct nfs4_stateowner lo_owner; /* must be first element */ struct list_head lo_owner_ino_hash; /* hash by owner,file */ - struct list_head lo_perstateid; /* for lockowners only */ + struct list_head lo_perstateid; struct list_head lo_list; /* for temporary uses */ }; @@ -390,7 +382,7 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) /* nfs4_file: a file opened by some number of (open) nfs4_stateowners. 
*/ struct nfs4_file { atomic_t fi_ref; - struct list_head fi_hash; /* hash by "struct inode *" */ + struct hlist_node fi_hash; /* hash by "struct inode *" */ struct list_head fi_stateids; struct list_head fi_delegations; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ @@ -486,8 +478,7 @@ extern void nfs4_put_delegation(struct nfs4_delegation *dp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); -extern void release_session_client(struct nfsd4_session *); -extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); +extern void put_client_renew(struct nfs4_client *clp); /* nfs4recover operations */ extern int nfsd4_client_tracking_init(struct net *net); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2a7eb536de0b..84ce601d8063 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1013,6 +1013,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int host_err; int stable = *stablep; int use_wgather; + loff_t pos = offset; dentry = file->f_path.dentry; inode = dentry->d_inode; @@ -1025,7 +1026,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, /* Write the data. */ oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); + host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos); set_fs(oldfs); if (host_err < 0) goto out_nfserr; @@ -1757,10 +1758,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, tdentry = tfhp->fh_dentry; tdir = tdentry->d_inode; - err = (rqstp->rq_vers == 2) ? 
nfserr_acces : nfserr_xdev; - if (ffhp->fh_export != tfhp->fh_export) - goto out; - err = nfserr_perm; if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) goto out; @@ -1801,6 +1798,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, host_err = -EXDEV; if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) goto out_dput_new; + if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) + goto out_dput_new; host_err = nfsd_break_lease(odentry->d_inode); if (host_err) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 546f8983ecf1..3b271d2092b6 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -184,7 +184,6 @@ struct nfsd4_lock { #define lk_old_lock_stateid v.old.lock_stateid #define lk_old_lock_seqid v.old.lock_seqid -#define lk_rflags u.ok.rflags #define lk_resp_stateid u.ok.stateid #define lk_denied u.denied @@ -237,6 +236,7 @@ struct nfsd4_open { u32 op_share_deny; /* request */ u32 op_deleg_want; /* request */ stateid_t op_stateid; /* response */ + __be32 op_xdr_error; /* see nfsd4_open_omfg() */ u32 op_recall; /* recall */ struct nfsd4_change_info op_cinfo; /* response */ u32 op_rflags; /* response */ @@ -623,6 +623,7 @@ extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_test_stateid *test_stateid); extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid); +extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr); #endif /* diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h new file mode 100644 index 000000000000..c5c55dfb91a9 --- /dev/null +++ b/fs/nfsd/xdr4cb.h @@ -0,0 +1,23 @@ +#define NFS4_MAXTAGLEN 20 + +#define NFS4_enc_cb_null_sz 0 +#define NFS4_dec_cb_null_sz 0 +#define cb_compound_enc_hdr_sz 4 +#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) +#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) +#define cb_sequence_enc_sz 
(sessionid_sz + 4 + \ + 1 /* no referring calls list yet */) +#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) + +#define op_enc_sz 1 +#define op_dec_sz 2 +#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) +#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) +#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + enc_stateid_sz + \ + enc_nfs4_fh_sz) + +#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 6b49f14eac8c..689fb608648e 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -25,7 +25,7 @@ #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/writeback.h> -#include <linux/uio.h> +#include <linux/aio.h> #include "nilfs.h" #include "btnode.h" #include "segment.h" @@ -175,6 +175,11 @@ static int nilfs_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int err = 0; + if (inode->i_sb->s_flags & MS_RDONLY) { + nilfs_clear_dirty_pages(mapping, false); + return -EROFS; + } + if (wbc->sync_mode == WB_SYNC_ALL) err = nilfs_construct_dsync_segment(inode->i_sb, inode, wbc->range_start, @@ -187,6 +192,18 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode = page->mapping->host; int err; + if (inode->i_sb->s_flags & MS_RDONLY) { + /* + * It means that filesystem was remounted in read-only + * mode because of error or metadata corruption. But we + * have dirty pages that try to be flushed in background. + * So, here we simply discard this dirty page. 
+ */ + nilfs_clear_dirty_page(page, false); + unlock_page(page); + return -EROFS; + } + redirty_page_for_writepage(wbc, page); unlock_page(page); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index f9897d09c693..c4dcd1db57ee 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -375,14 +375,25 @@ int nilfs_mdt_fetch_dirty(struct inode *inode) static int nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) { - struct inode *inode; + struct inode *inode = page->mapping->host; struct super_block *sb; int err = 0; + if (inode && (inode->i_sb->s_flags & MS_RDONLY)) { + /* + * It means that filesystem was remounted in read-only + * mode because of error or metadata corruption. But we + * have dirty pages that try to be flushed in background. + * So, here we simply discard this dirty page. + */ + nilfs_clear_dirty_page(page, false); + unlock_page(page); + return -EROFS; + } + redirty_page_for_writepage(wbc, page); unlock_page(page); - inode = page->mapping->host; if (!inode) return 0; @@ -561,10 +572,10 @@ void nilfs_mdt_restore_from_shadow_map(struct inode *inode) if (mi->mi_palloc_cache) nilfs_palloc_clear_cache(inode); - nilfs_clear_dirty_pages(inode->i_mapping); + nilfs_clear_dirty_pages(inode->i_mapping, true); nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data); - nilfs_clear_dirty_pages(&ii->i_btnode_cache); + nilfs_clear_dirty_pages(&ii->i_btnode_cache, true); nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes); nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 07f76db04ec7..0ba679866e50 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -370,7 +370,12 @@ repeat: goto repeat; } -void nilfs_clear_dirty_pages(struct address_space *mapping) +/** + * nilfs_clear_dirty_pages - discard dirty pages in address space + * @mapping: address space with dirty pages for discarding + * @silent: suppress [true] or print [false] warning messages + */ +void 
nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) { struct pagevec pvec; unsigned int i; @@ -382,25 +387,9 @@ void nilfs_clear_dirty_pages(struct address_space *mapping) PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; lock_page(page); - ClearPageUptodate(page); - ClearPageMappedToDisk(page); - bh = head = page_buffers(page); - do { - lock_buffer(bh); - clear_buffer_dirty(bh); - clear_buffer_nilfs_volatile(bh); - clear_buffer_nilfs_checked(bh); - clear_buffer_nilfs_redirected(bh); - clear_buffer_uptodate(bh); - clear_buffer_mapped(bh); - unlock_buffer(bh); - bh = bh->b_this_page; - } while (bh != head); - - __nilfs_clear_page_dirty(page); + nilfs_clear_dirty_page(page, silent); unlock_page(page); } pagevec_release(&pvec); @@ -408,6 +397,51 @@ void nilfs_clear_dirty_pages(struct address_space *mapping) } } +/** + * nilfs_clear_dirty_page - discard dirty page + * @page: dirty page that will be discarded + * @silent: suppress [true] or print [false] warning messages + */ +void nilfs_clear_dirty_page(struct page *page, bool silent) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + + BUG_ON(!PageLocked(page)); + + if (!silent) { + nilfs_warning(sb, __func__, + "discard page: offset %lld, ino %lu", + page_offset(page), inode->i_ino); + } + + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + lock_buffer(bh); + if (!silent) { + nilfs_warning(sb, __func__, + "discard block %llu, size %zu", + (u64)bh->b_blocknr, bh->b_size); + } + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + clear_buffer_nilfs_checked(bh); + clear_buffer_nilfs_redirected(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + unlock_buffer(bh); + } while (bh = bh->b_this_page, bh != head); + } + + __nilfs_clear_page_dirty(page); +} + 
unsigned nilfs_page_count_clean_buffers(struct page *page, unsigned from, unsigned to) { diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index fb7de71605a0..ef30c5c2426f 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -55,7 +55,8 @@ void nilfs_page_bug(struct page *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); -void nilfs_clear_dirty_pages(struct address_space *); +void nilfs_clear_dirty_page(struct page *, bool); +void nilfs_clear_dirty_pages(struct address_space *, bool); void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, struct backing_dev_info *bdi); unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3c991dc84f2f..c7d1f9f18b09 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1361,6 +1361,7 @@ struct file_system_type nilfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("nilfs2"); static void nilfs_inode_init_once(void *obj) { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 5d8444268a16..d0be29fa94cf 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -755,9 +755,9 @@ out_destroy_group: return fd; } -SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, - __u64 mask, int dfd, - const char __user * pathname) +SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, + __u64, mask, int, dfd, + const char __user *, pathname) { struct inode *inode = NULL; struct vfsmount *mnt = NULL; @@ -857,17 +857,6 @@ fput_and_out: return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask, - long dfd, long pathname) -{ - return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags, - mask, (int) dfd, - (const char __user *) 
pathname); -} -SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark); -#endif - /* * fanotify_user_setup - Our initialization function. Note that we cannot return * error because we have compiled-in VFS hooks. So an (unlikely) failure here diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index e0f7c1241a6a..959815c1e017 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -287,9 +287,6 @@ static int inotify_release(struct inode *ignored, struct file *file) pr_debug("%s: group=%p\n", __func__, group); - if (file->f_flags & FASYNC) - fsnotify_fasync(-1, file, 0); - /* free this group, matching get was inotify_init->fsnotify_obtain_group */ fsnotify_destroy_group(group); @@ -359,7 +356,6 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns } static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, - int *last_wd, struct inotify_inode_mark *i_mark) { int ret; @@ -367,11 +363,10 @@ static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, idr_preload(GFP_KERNEL); spin_lock(idr_lock); - ret = idr_alloc(idr, i_mark, *last_wd + 1, 0, GFP_NOWAIT); + ret = idr_alloc_cyclic(idr, i_mark, 1, 0, GFP_NOWAIT); if (ret >= 0) { /* we added the mark to the idr, take a reference */ i_mark->wd = ret; - *last_wd = i_mark->wd; fsnotify_get_mark(&i_mark->fsn_mark); } @@ -572,7 +567,6 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, int add = (arg & IN_MASK_ADD); int ret; - /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); fsn_mark = fsnotify_find_inode_mark(group, inode); @@ -623,7 +617,6 @@ static int inotify_new_watch(struct fsnotify_group *group, struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; - /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); @@ 
-638,8 +631,7 @@ static int inotify_new_watch(struct fsnotify_group *group, if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) goto out_err; - ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd, - tmp_i_mark); + ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); if (ret) goto out_err; @@ -697,7 +689,6 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.last_wd = 0; group->inotify_data.user = get_current_user(); if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > @@ -751,6 +742,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, int ret; unsigned flags = 0; + /* don't allow invalid bits: we don't want flags set */ + if (unlikely(!(mask & ALL_INOTIFY_BITS))) + return -EINVAL; + f = fdget(fd); if (unlikely(!f.file)) return -EBADF; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 5b2d4f0853ac..c5670b8d198c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -27,6 +27,7 @@ #include <linux/swap.h> #include <linux/uio.h> #include <linux/writeback.h> +#include <linux/aio.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -2129,7 +2130,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); - sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2138,7 +2138,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0) ret = err; } - sb_end_write(inode->i_sb); return ret; } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index d3e118cc6ffa..2778b0255dc6 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -28,6 +28,7 @@ #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/log2.h> +#include <linux/aio.h> #include "aops.h" #include 
"attrib.h" diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 4a8289f8b16c..82650d52d916 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3079,6 +3079,7 @@ static struct file_system_type ntfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ntfs"); /* Stable names for the slab caches. */ static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache"; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index ffb2da370a99..f671e49beb34 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,6 +22,8 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H +#include <linux/aio.h> + handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, unsigned from, diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index eeac97bb3bfa..b3fdd1a323d6 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1498,10 +1498,8 @@ leave: dlm_put(dlm); if (ret < 0) { - if (buf) - kfree(buf); - if (item) - kfree(item); + kfree(buf); + kfree(item); mlog_errno(ret); } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 4c5fc8d77dc2..12bafb7265ce 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -640,6 +640,7 @@ static struct file_system_type dlmfs_fs_type = { .mount = dlmfs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("ocfs2_dlmfs"); static int __init init_dlmfs_fs(void) { diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 12ae194ac943..3a44a648dae7 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2322,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags, subclass, _RET_IP_); if (status < 0) { - if (status != -EAGAIN && status != -EIOCBRETRY) + if (status != -EAGAIN) mlog_errno(status); goto bail; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6474cb44004d..8a7509f9e6f5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2248,8 +2248,6 
@@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, if (iocb->ki_left == 0) return 0; - sb_start_write(inode->i_sb); - appending = file->f_flags & O_APPEND ? 1 : 0; direct_io = file->f_flags & O_DIRECT ? 1 : 0; @@ -2423,7 +2421,6 @@ out_sems: ocfs2_iocb_clear_sem_locked(iocb); mutex_unlock(&inode->i_mutex); - sb_end_write(inode->i_sb); if (written) ret = written; @@ -2468,8 +2465,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, out->f_path.dentry->d_name.len, out->f_path.dentry->d_name.name, len); - if (pipe->inode) - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); + pipe_lock(pipe); splice_from_pipe_begin(&sd); do { @@ -2489,8 +2485,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, } while (ret > 0); splice_from_pipe_end(pipe, &sd); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); if (sd.num_spliced) ret = sd.num_spliced; diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 88924a3133fa..621fc73bf23d 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -147,8 +147,6 @@ void ocfs2_refresh_inode(struct inode *inode, int ocfs2_mark_inode_dirty(handle_t *handle, struct inode *inode, struct buffer_head *bh); -int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); -int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); struct buffer_head *ocfs2_bread(struct inode *inode, int block, int *err, int reada); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 752f0b26221d..0c60ef2d8056 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -101,13 +101,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, if (!S_ISDIR(inode->i_mode)) flags &= ~OCFS2_DIRSYNC_FL; - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto bail_unlock; - } - oldflags = ocfs2_inode->ip_attr; flags = flags & mask; flags |= oldflags & ~mask; @@ 
-120,7 +113,14 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) - goto bail_commit; + goto bail_unlock; + } + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; } ocfs2_inode->ip_attr = flags; @@ -130,8 +130,8 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, if (status < 0) mlog_errno(status); -bail_commit: ocfs2_commit_trans(osb, handle); + bail_unlock: ocfs2_inode_unlock(inode, 1); bail: @@ -706,8 +706,10 @@ int ocfs2_info_handle_freefrag(struct inode *inode, o2info_set_request_filled(&oiff->iff_req); - if (o2info_to_user(*oiff, req)) + if (o2info_to_user(*oiff, req)) { + status = -EFAULT; goto bail; + } status = 0; bail: diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 9f8dcadd9a50..f1fc172175b6 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -471,7 +471,7 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, int ret, goal_bit = 0; struct buffer_head *gd_bh = NULL; - struct ocfs2_group_desc *bg = NULL; + struct ocfs2_group_desc *bg; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int c_to_b = 1 << (osb->s_clustersize_bits - inode->i_sb->s_blocksize_bits); @@ -482,13 +482,6 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb, range->me_goal); /* - * moving goal is not allowd to start with a group desc blok(#0 blk) - * let's compromise to the latter cluster. 
- */ - if (range->me_goal == le64_to_cpu(bg->bg_blkno)) - range->me_goal += c_to_b; - - /* * validate goal sits within global_bitmap, and return the victim * group desc */ @@ -502,6 +495,13 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, bg = (struct ocfs2_group_desc *)gd_bh->b_data; /* + * moving goal is not allowd to start with a group desc blok(#0 blk) + * let's compromise to the latter cluster. + */ + if (range->me_goal == le64_to_cpu(bg->bg_blkno)) + range->me_goal += c_to_b; + + /* * movement is not gonna cross two groups. */ if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < @@ -1057,42 +1057,40 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) struct inode *inode = file_inode(filp); struct ocfs2_move_extents range; - struct ocfs2_move_extents_context *context = NULL; + struct ocfs2_move_extents_context *context; + + if (!argp) + return -EINVAL; status = mnt_want_write_file(filp); if (status) return status; if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) - goto out; + goto out_drop; if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { status = -EPERM; - goto out; + goto out_drop; } context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); if (!context) { status = -ENOMEM; mlog_errno(status); - goto out; + goto out_drop; } context->inode = inode; context->file = filp; - if (argp) { - if (copy_from_user(&range, argp, sizeof(range))) { - status = -EFAULT; - goto out; - } - } else { - status = -EINVAL; - goto out; + if (copy_from_user(&range, argp, sizeof(range))) { + status = -EFAULT; + goto out_free; } if (range.me_start > i_size_read(inode)) - goto out; + goto out_free; if (range.me_start + range.me_len > i_size_read(inode)) range.me_len = i_size_read(inode) - range.me_start; @@ -1124,25 +1122,24 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) status = ocfs2_validate_and_adjust_move_goal(inode, &range); if (status) - goto out; + goto out_copy; } status = 
ocfs2_move_extents(context); if (status) mlog_errno(status); -out: +out_copy: /* * movement/defragmentation may end up being partially completed, * that's the reason why we need to return userspace the finished * length and new_offset even if failure happens somewhere. */ - if (argp) { - if (copy_to_user(argp, &range, sizeof(range))) - status = -EFAULT; - } + if (copy_to_user(argp, &range, sizeof(range))) + status = -EFAULT; +out_free: kfree(context); - +out_drop: mnt_drop_write_file(filp); return status; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 9b6910dec4ba..01b85165552b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1266,6 +1266,7 @@ static struct file_system_type ocfs2_fs_type = { .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; +MODULE_ALIAS_FS("ocfs2"); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options) diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 25d715c7c87a..d8b0afde2179 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -572,6 +572,7 @@ static struct file_system_type omfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("omfs"); static int __init init_omfs_fs(void) { diff --git a/fs/open.c b/fs/open.c index 68354466879f..8c741002f947 100644 --- a/fs/open.c +++ b/fs/open.c @@ -197,10 +197,7 @@ out: SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) { - long ret = do_sys_ftruncate(fd, length, 1); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, fd, length); - return ret; + return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT @@ -212,32 +209,15 @@ COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 -SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length) +SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) { return do_sys_truncate(path, 
length); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_truncate64(long path, loff_t length) -{ - return SYSC_truncate64((const char __user *) path, length); -} -SYSCALL_ALIAS(sys_truncate64, SyS_truncate64); -#endif -SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length) +SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { - long ret = do_sys_ftruncate(fd, length, 0); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, fd, length); - return ret; -} -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_ftruncate64(long fd, loff_t length) -{ - return SYSC_ftruncate64((unsigned int) fd, length); + return do_sys_ftruncate(fd, length, 0); } -SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64); -#endif #endif /* BITS_PER_LONG == 32 */ @@ -299,7 +279,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return ret; } -SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) +SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { struct fd f = fdget(fd); int error = -EBADF; @@ -311,14 +291,6 @@ SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) return error; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len) -{ - return SYSC_fallocate((int)fd, (int)mode, offset, len); -} -SYSCALL_ALIAS(sys_fallocate, SyS_fallocate); -#endif - /* * access() needs to use the real uid/gid, not the effective uid/gid. 
* We do this by temporarily clearing all FS-related capabilities and @@ -983,29 +955,19 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { - long ret; - if (force_o_largefile()) flags |= O_LARGEFILE; - ret = do_sys_open(AT_FDCWD, filename, flags, mode); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, flags, mode); - return ret; + return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { - long ret; - if (force_o_largefile()) flags |= O_LARGEFILE; - ret = do_sys_open(dfd, filename, flags, mode); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(4, ret, dfd, filename, flags, mode); - return ret; + return do_sys_open(dfd, filename, flags, mode); } #ifndef __alpha__ diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index ae47fa7efb9d..75885ffde44e 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -432,6 +432,7 @@ static struct file_system_type openprom_fs_type = { .mount = openprom_mount, .kill_sb = kill_anon_super, }; +MODULE_ALIAS_FS("openpromfs"); static void op_inode_init_once(void *data) { diff --git a/fs/pipe.c b/fs/pipe.c index 64a494cef0a0..d2c45e14e6d8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,10 +21,13 @@ #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include <asm/ioctls.h> +#include "internal.h" + /* * The max size that a non-root user is allowed to grow the pipe. 
Can * be set by root in /proc/sys/fs/pipe-max-size @@ -53,8 +56,8 @@ unsigned int pipe_min_size = PAGE_SIZE; static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) { - if (pipe->inode) - mutex_lock_nested(&pipe->inode->i_mutex, subclass); + if (pipe->files) + mutex_lock_nested(&pipe->mutex, subclass); } void pipe_lock(struct pipe_inode_info *pipe) @@ -68,11 +71,21 @@ EXPORT_SYMBOL(pipe_lock); void pipe_unlock(struct pipe_inode_info *pipe) { - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + if (pipe->files) + mutex_unlock(&pipe->mutex); } EXPORT_SYMBOL(pipe_unlock); +static inline void __pipe_lock(struct pipe_inode_info *pipe) +{ + mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); +} + +static inline void __pipe_unlock(struct pipe_inode_info *pipe) +{ + mutex_unlock(&pipe->mutex); +} + void pipe_double_lock(struct pipe_inode_info *pipe1, struct pipe_inode_info *pipe2) { @@ -361,8 +374,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe; + struct pipe_inode_info *pipe = filp->private_data; int do_wakeup; ssize_t ret; struct iovec *iov = (struct iovec *)_iov; @@ -375,8 +387,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; + __pipe_lock(pipe); for (;;) { int bufs = pipe->nrbufs; if (bufs) { @@ -464,7 +475,7 @@ redo: } pipe_wait(pipe); } - mutex_unlock(&inode->i_mutex); + __pipe_unlock(pipe); /* Signal writers asynchronously that there is more room. 
*/ if (do_wakeup) { @@ -486,8 +497,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t ppos) { struct file *filp = iocb->ki_filp; - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe; + struct pipe_inode_info *pipe = filp->private_data; ssize_t ret; int do_wakeup; struct iovec *iov = (struct iovec *)_iov; @@ -501,8 +511,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; + __pipe_lock(pipe); if (!pipe->readers) { send_sig(SIGPIPE, current, 0); @@ -649,7 +658,7 @@ redo2: pipe->waiting_writers--; } out: - mutex_unlock(&inode->i_mutex); + __pipe_unlock(pipe); if (do_wakeup) { wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); @@ -662,29 +671,14 @@ out: return ret; } -static ssize_t -bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) -{ - return -EBADF; -} - -static ssize_t -bad_pipe_w(struct file *filp, const char __user *buf, size_t count, - loff_t *ppos) -{ - return -EBADF; -} - static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe; + struct pipe_inode_info *pipe = filp->private_data; int count, buf, nrbufs; switch (cmd) { case FIONREAD: - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; + __pipe_lock(pipe); count = 0; buf = pipe->curbuf; nrbufs = pipe->nrbufs; @@ -692,7 +686,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) count += pipe->bufs[buf].len; buf = (buf+1) & (pipe->buffers - 1); } - mutex_unlock(&inode->i_mutex); + __pipe_unlock(pipe); return put_user(count, (int __user *)arg); default: @@ -705,8 +699,7 @@ static unsigned int pipe_poll(struct file *filp, poll_table *wait) { unsigned int mask; - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe = inode->i_pipe; 
+ struct pipe_inode_info *pipe = filp->private_data; int nrbufs; poll_wait(filp, &pipe->wait, wait); @@ -734,194 +727,56 @@ pipe_poll(struct file *filp, poll_table *wait) } static int -pipe_release(struct inode *inode, int decr, int decw) +pipe_release(struct inode *inode, struct file *file) { - struct pipe_inode_info *pipe; + struct pipe_inode_info *pipe = inode->i_pipe; + int kill = 0; - mutex_lock(&inode->i_mutex); - pipe = inode->i_pipe; - pipe->readers -= decr; - pipe->writers -= decw; + __pipe_lock(pipe); + if (file->f_mode & FMODE_READ) + pipe->readers--; + if (file->f_mode & FMODE_WRITE) + pipe->writers--; - if (!pipe->readers && !pipe->writers) { - free_pipe_info(inode); - } else { + if (pipe->readers || pipe->writers) { wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_unlock(&inode->i_mutex); - - return 0; -} - -static int -pipe_read_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = file_inode(filp); - int retval; - - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); - mutex_unlock(&inode->i_mutex); - - return retval; -} - - -static int -pipe_write_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = file_inode(filp); - int retval; + spin_lock(&inode->i_lock); + if (!--pipe->files) { + inode->i_pipe = NULL; + kill = 1; + } + spin_unlock(&inode->i_lock); + __pipe_unlock(pipe); - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); - mutex_unlock(&inode->i_mutex); + if (kill) + free_pipe_info(pipe); - return retval; + return 0; } - static int -pipe_rdwr_fasync(int fd, struct file *filp, int on) +pipe_fasync(int fd, struct file *filp, int on) { - struct inode *inode = file_inode(filp); - struct pipe_inode_info *pipe = inode->i_pipe; - int retval; + 
struct pipe_inode_info *pipe = filp->private_data; + int retval = 0; - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); - if (retval >= 0) { + __pipe_lock(pipe); + if (filp->f_mode & FMODE_READ) + retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); + if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); - if (retval < 0) /* this can happen only if on == T */ + if (retval < 0 && (filp->f_mode & FMODE_READ)) + /* this can happen only if on == T */ fasync_helper(-1, filp, 0, &pipe->fasync_readers); } - mutex_unlock(&inode->i_mutex); + __pipe_unlock(pipe); return retval; } - -static int -pipe_read_release(struct inode *inode, struct file *filp) -{ - return pipe_release(inode, 1, 0); -} - -static int -pipe_write_release(struct inode *inode, struct file *filp) -{ - return pipe_release(inode, 0, 1); -} - -static int -pipe_rdwr_release(struct inode *inode, struct file *filp) -{ - int decr, decw; - - decr = (filp->f_mode & FMODE_READ) != 0; - decw = (filp->f_mode & FMODE_WRITE) != 0; - return pipe_release(inode, decr, decw); -} - -static int -pipe_read_open(struct inode *inode, struct file *filp) -{ - int ret = -ENOENT; - - mutex_lock(&inode->i_mutex); - - if (inode->i_pipe) { - ret = 0; - inode->i_pipe->readers++; - } - - mutex_unlock(&inode->i_mutex); - - return ret; -} - -static int -pipe_write_open(struct inode *inode, struct file *filp) -{ - int ret = -ENOENT; - - mutex_lock(&inode->i_mutex); - - if (inode->i_pipe) { - ret = 0; - inode->i_pipe->writers++; - } - - mutex_unlock(&inode->i_mutex); - - return ret; -} - -static int -pipe_rdwr_open(struct inode *inode, struct file *filp) -{ - int ret = -ENOENT; - - mutex_lock(&inode->i_mutex); - - if (inode->i_pipe) { - ret = 0; - if (filp->f_mode & FMODE_READ) - inode->i_pipe->readers++; - if (filp->f_mode & FMODE_WRITE) - inode->i_pipe->writers++; - } - - mutex_unlock(&inode->i_mutex); - - return ret; -} - -/* - * 
The file_operations structs are not static because they - * are also used in linux/fs/fifo.c to do operations on FIFOs. - * - * Pipes reuse fifos' file_operations structs. - */ -const struct file_operations read_pipefifo_fops = { - .llseek = no_llseek, - .read = do_sync_read, - .aio_read = pipe_read, - .write = bad_pipe_w, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_read_open, - .release = pipe_read_release, - .fasync = pipe_read_fasync, -}; - -const struct file_operations write_pipefifo_fops = { - .llseek = no_llseek, - .read = bad_pipe_r, - .write = do_sync_write, - .aio_write = pipe_write, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_write_open, - .release = pipe_write_release, - .fasync = pipe_write_fasync, -}; - -const struct file_operations rdwr_pipefifo_fops = { - .llseek = no_llseek, - .read = do_sync_read, - .aio_read = pipe_read, - .write = do_sync_write, - .aio_write = pipe_write, - .poll = pipe_poll, - .unlocked_ioctl = pipe_ioctl, - .open = pipe_rdwr_open, - .release = pipe_rdwr_release, - .fasync = pipe_rdwr_fasync, -}; - -struct pipe_inode_info * alloc_pipe_info(struct inode *inode) +struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; @@ -931,8 +786,8 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; - pipe->inode = inode; pipe->buffers = PIPE_DEF_BUFFERS; + mutex_init(&pipe->mutex); return pipe; } kfree(pipe); @@ -941,7 +796,7 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) return NULL; } -void __free_pipe_info(struct pipe_inode_info *pipe) +void free_pipe_info(struct pipe_inode_info *pipe) { int i; @@ -956,12 +811,6 @@ void __free_pipe_info(struct pipe_inode_info *pipe) kfree(pipe); } -void free_pipe_info(struct inode *inode) -{ - __free_pipe_info(inode->i_pipe); - inode->i_pipe = NULL; -} - static struct vfsmount *pipe_mnt __read_mostly; /* @@ -987,13 
+836,14 @@ static struct inode * get_pipe_inode(void) inode->i_ino = get_next_ino(); - pipe = alloc_pipe_info(inode); + pipe = alloc_pipe_info(); if (!pipe) goto fail_iput; - inode->i_pipe = pipe; + inode->i_pipe = pipe; + pipe->files = 2; pipe->readers = pipe->writers = 1; - inode->i_fop = &rdwr_pipefifo_fops; + inode->i_fop = &pipefifo_fops; /* * Mark the inode dirty from the very beginning, @@ -1036,17 +886,19 @@ int create_pipe_files(struct file **res, int flags) d_instantiate(path.dentry, inode); err = -ENFILE; - f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); + f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops); if (IS_ERR(f)) goto err_dentry; f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); + f->private_data = inode->i_pipe; - res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops); + res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops); if (IS_ERR(res[0])) goto err_file; path_get(&path); + res[0]->private_data = inode->i_pipe; res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK); res[1] = f; return 0; @@ -1054,12 +906,12 @@ int create_pipe_files(struct file **res, int flags) err_file: put_filp(f); err_dentry: - free_pipe_info(inode); + free_pipe_info(inode->i_pipe); path_put(&path); return err; err_inode: - free_pipe_info(inode); + free_pipe_info(inode->i_pipe); iput(inode); return err; } @@ -1141,6 +993,168 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) return sys_pipe2(fildes, 0); } +static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) +{ + int cur = *cnt; + + while (cur == *cnt) { + pipe_wait(pipe); + if (signal_pending(current)) + break; + } + return cur == *cnt ? 
-ERESTARTSYS : 0; +} + +static void wake_up_partner(struct pipe_inode_info *pipe) +{ + wake_up_interruptible(&pipe->wait); +} + +static int fifo_open(struct inode *inode, struct file *filp) +{ + struct pipe_inode_info *pipe; + bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; + int kill = 0; + int ret; + + filp->f_version = 0; + + spin_lock(&inode->i_lock); + if (inode->i_pipe) { + pipe = inode->i_pipe; + pipe->files++; + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); + pipe = alloc_pipe_info(); + if (!pipe) + return -ENOMEM; + pipe->files = 1; + spin_lock(&inode->i_lock); + if (unlikely(inode->i_pipe)) { + inode->i_pipe->files++; + spin_unlock(&inode->i_lock); + free_pipe_info(pipe); + pipe = inode->i_pipe; + } else { + inode->i_pipe = pipe; + spin_unlock(&inode->i_lock); + } + } + filp->private_data = pipe; + /* OK, we have a pipe and it's pinned down */ + + __pipe_lock(pipe); + + /* We can only do regular read/write on fifos */ + filp->f_mode &= (FMODE_READ | FMODE_WRITE); + + switch (filp->f_mode) { + case FMODE_READ: + /* + * O_RDONLY + * POSIX.1 says that O_NONBLOCK means return with the FIFO + * opened, even when there is no process writing the FIFO. + */ + pipe->r_counter++; + if (pipe->readers++ == 0) + wake_up_partner(pipe); + + if (!is_pipe && !pipe->writers) { + if ((filp->f_flags & O_NONBLOCK)) { + /* suppress POLLHUP until we have + * seen a writer */ + filp->f_version = pipe->w_counter; + } else { + if (wait_for_partner(pipe, &pipe->w_counter)) + goto err_rd; + } + } + break; + + case FMODE_WRITE: + /* + * O_WRONLY + * POSIX.1 says that O_NONBLOCK means return -1 with + * errno=ENXIO when there is no process reading the FIFO. 
+ */ + ret = -ENXIO; + if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers) + goto err; + + pipe->w_counter++; + if (!pipe->writers++) + wake_up_partner(pipe); + + if (!is_pipe && !pipe->readers) { + if (wait_for_partner(pipe, &pipe->r_counter)) + goto err_wr; + } + break; + + case FMODE_READ | FMODE_WRITE: + /* + * O_RDWR + * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. + * This implementation will NEVER block on a O_RDWR open, since + * the process can at least talk to itself. + */ + + pipe->readers++; + pipe->writers++; + pipe->r_counter++; + pipe->w_counter++; + if (pipe->readers == 1 || pipe->writers == 1) + wake_up_partner(pipe); + break; + + default: + ret = -EINVAL; + goto err; + } + + /* Ok! */ + __pipe_unlock(pipe); + return 0; + +err_rd: + if (!--pipe->readers) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + +err_wr: + if (!--pipe->writers) + wake_up_interruptible(&pipe->wait); + ret = -ERESTARTSYS; + goto err; + +err: + spin_lock(&inode->i_lock); + if (!--pipe->files) { + inode->i_pipe = NULL; + kill = 1; + } + spin_unlock(&inode->i_lock); + __pipe_unlock(pipe); + if (kill) + free_pipe_info(pipe); + return ret; +} + +const struct file_operations pipefifo_fops = { + .open = fifo_open, + .llseek = no_llseek, + .read = do_sync_read, + .aio_read = pipe_read, + .write = do_sync_write, + .aio_write = pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .release = pipe_release, + .fasync = pipe_fasync, +}; + /* * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. @@ -1226,9 +1240,7 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, */ struct pipe_inode_info *get_pipe_info(struct file *file) { - struct inode *i = file_inode(file); - - return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; + return file->f_op == &pipefifo_fops ? 
file->private_data : NULL; } long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) @@ -1240,7 +1252,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (!pipe) return -EBADF; - mutex_lock(&pipe->inode->i_mutex); + __pipe_lock(pipe); switch (cmd) { case F_SETPIPE_SZ: { @@ -1269,7 +1281,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) } out: - mutex_unlock(&pipe->inode->i_mutex); + __pipe_unlock(pipe); return ret; } diff --git a/fs/pnode.c b/fs/pnode.c index 3e000a51ac0d..3d2a7141b87a 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -9,6 +9,7 @@ #include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/fs.h> +#include <linux/nsproxy.h> #include "internal.h" #include "pnode.h" @@ -217,15 +218,15 @@ static struct mount *get_source(struct mount *dest, * @source_mnt: source mount. * @tree_list : list of heads of trees to be attached. */ -int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, +int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, struct mount *source_mnt, struct list_head *tree_list) { + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct mount *m, *child; int ret = 0; struct mount *prev_dest_mnt = dest_mnt; struct mount *prev_src_mnt = source_mnt; LIST_HEAD(tmp_list); - LIST_HEAD(umount_list); for (m = propagation_next(dest_mnt, dest_mnt); m; m = propagation_next(m, dest_mnt)) { @@ -237,6 +238,10 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); + /* Notice when we are propagating across user namespaces */ + if (m->mnt_ns->user_ns != user_ns) + type |= CL_UNPRIVILEGED; + child = copy_tree(source, source->mnt.mnt_root, type); if (IS_ERR(child)) { ret = PTR_ERR(child); @@ -244,8 +249,8 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, goto out; } - if (is_subdir(dest_dentry, m->mnt.mnt_root)) { - 
mnt_set_mountpoint(m, dest_dentry, child); + if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { + mnt_set_mountpoint(m, dest_mp, child); list_add_tail(&child->mnt_hash, tree_list); } else { /* @@ -261,10 +266,9 @@ out: br_write_lock(&vfsmount_lock); while (!list_empty(&tmp_list)) { child = list_first_entry(&tmp_list, struct mount, mnt_hash); - umount_tree(child, 0, &umount_list); + umount_tree(child, 0); } br_write_unlock(&vfsmount_lock); - release_mounts(&umount_list); return ret; } diff --git a/fs/pnode.h b/fs/pnode.h index 19b853a3445c..b091445c1c4a 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -23,6 +23,7 @@ #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 +#define CL_UNPRIVILEGED 0x40 static inline void set_mnt_shared(struct mount *mnt) { @@ -31,17 +32,16 @@ static inline void set_mnt_shared(struct mount *mnt) } void change_mnt_propagation(struct mount *, int); -int propagate_mnt(struct mount *, struct dentry *, struct mount *, +int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct list_head *); int propagate_umount(struct list_head *); int propagate_mount_busy(struct mount *, int); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); unsigned int mnt_get_count(struct mount *mnt); -void mnt_set_mountpoint(struct mount *, struct dentry *, +void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); -void release_mounts(struct list_head *); -void umount_tree(struct mount *, int, struct list_head *); +void umount_tree(struct mount *, int); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 712f24db9600..ab30716584f5 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -5,7 +5,7 @@ obj-y += proc.o proc-y := nommu.o task_nommu.o -proc-$(CONFIG_MMU) := mmu.o task_mmu.o 
+proc-$(CONFIG_MMU) := task_mmu.o proc-y += inode.o root.o base.o generic.o array.o \ fd.o diff --git a/fs/proc/array.c b/fs/proc/array.c index f7ed9ee46eb9..cbd0f1b324b9 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -143,6 +143,7 @@ static const char * const task_state_array[] = { "x (dead)", /* 64 */ "K (wakekill)", /* 128 */ "W (waking)", /* 256 */ + "P (parked)", /* 512 */ }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/fs/proc/base.c b/fs/proc/base.c index 69078c7cef1f..dd51e50001fe 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -86,6 +86,7 @@ #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/flex_array.h> +#include <linux/posix-timers.h> #ifdef CONFIG_HARDWALL #include <asm/hardwall.h> #endif @@ -404,6 +405,37 @@ static const struct file_operations proc_lstats_operations = { #endif +#ifdef CONFIG_CGROUPS +static int cgroup_open(struct inode *inode, struct file *file) +{ + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cgroup_show, pid); +} + +static const struct file_operations proc_cgroup_operations = { + .open = cgroup_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +#ifdef CONFIG_PROC_PID_CPUSET + +static int cpuset_open(struct inode *inode, struct file *file) +{ + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cpuset_show, pid); +} + +static const struct file_operations proc_cpuset_operations = { + .open = cpuset_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static int proc_oom_score(struct task_struct *task, char *buffer) { unsigned long totalpages = totalram_pages + total_swap_pages; @@ -1347,11 +1379,10 @@ static ssize_t comm_write(struct file *file, const char __user *buf, struct inode *inode = file_inode(file); struct task_struct *p; char buffer[TASK_COMM_LEN]; + const size_t maxlen = sizeof(buffer) - 1; memset(buffer, 0, sizeof(buffer)); - if (count > 
sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) + if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) return -EFAULT; p = get_proc_task(inode); @@ -1621,6 +1652,15 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) return 0; } +int pid_delete_dentry(const struct dentry *dentry) +{ + /* Is the task we represent dead? + * If so, then don't put the dentry on the lru list, + * kill it immediately. + */ + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; +} + const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, @@ -2013,6 +2053,102 @@ static const struct file_operations proc_map_files_operations = { .llseek = default_llseek, }; +struct timers_private { + struct pid *pid; + struct task_struct *task; + struct sighand_struct *sighand; + struct pid_namespace *ns; + unsigned long flags; +}; + +static void *timers_start(struct seq_file *m, loff_t *pos) +{ + struct timers_private *tp = m->private; + + tp->task = get_pid_task(tp->pid, PIDTYPE_PID); + if (!tp->task) + return ERR_PTR(-ESRCH); + + tp->sighand = lock_task_sighand(tp->task, &tp->flags); + if (!tp->sighand) + return ERR_PTR(-ESRCH); + + return seq_list_start(&tp->task->signal->posix_timers, *pos); +} + +static void *timers_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct timers_private *tp = m->private; + return seq_list_next(v, &tp->task->signal->posix_timers, pos); +} + +static void timers_stop(struct seq_file *m, void *v) +{ + struct timers_private *tp = m->private; + + if (tp->sighand) { + unlock_task_sighand(tp->task, &tp->flags); + tp->sighand = NULL; + } + + if (tp->task) { + put_task_struct(tp->task); + tp->task = NULL; + } +} + +static int show_timer(struct seq_file *m, void *v) +{ + struct k_itimer *timer; + struct timers_private *tp = m->private; + int notify; + static char *nstr[] = { + [SIGEV_SIGNAL] = "signal", + [SIGEV_NONE] = "none", + [SIGEV_THREAD] = "thread", + }; + + 
timer = list_entry((struct list_head *)v, struct k_itimer, list); + notify = timer->it_sigev_notify; + + seq_printf(m, "ID: %d\n", timer->it_id); + seq_printf(m, "signal: %d/%p\n", timer->sigq->info.si_signo, + timer->sigq->info.si_value.sival_ptr); + seq_printf(m, "notify: %s/%s.%d\n", + nstr[notify & ~SIGEV_THREAD_ID], + (notify & SIGEV_THREAD_ID) ? "tid" : "pid", + pid_nr_ns(timer->it_pid, tp->ns)); + + return 0; +} + +static const struct seq_operations proc_timers_seq_ops = { + .start = timers_start, + .next = timers_next, + .stop = timers_stop, + .show = show_timer, +}; + +static int proc_timers_open(struct inode *inode, struct file *file) +{ + struct timers_private *tp; + + tp = __seq_open_private(file, &proc_timers_seq_ops, + sizeof(struct timers_private)); + if (!tp) + return -ENOMEM; + + tp->pid = proc_pid(inode); + tp->ns = inode->i_sb->s_fs_info; + return 0; +} + +static const struct file_operations proc_timers_operations = { + .open = proc_timers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; #endif /* CONFIG_CHECKPOINT_RESTORE */ static struct dentry *proc_pident_instantiate(struct inode *dir, @@ -2583,6 +2719,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), #endif +#ifdef CONFIG_CHECKPOINT_RESTORE + REG("timers", S_IRUGO, proc_timers_operations), +#endif }; static int proc_tgid_base_readdir(struct file * filp, @@ -2794,7 +2933,7 @@ retry: return iter; } -#define TGID_OFFSET (FIRST_PROCESS_ENTRY) +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct tgid_iter iter) @@ -2817,13 +2956,21 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) struct tgid_iter iter; struct pid_namespace *ns; filldir_t __filldir; + loff_t pos = filp->f_pos; - if (filp->f_pos >= PID_MAX_LIMIT + 
TGID_OFFSET) + if (pos >= PID_MAX_LIMIT + TGID_OFFSET) goto out; - ns = filp->f_dentry->d_sb->s_fs_info; + if (pos == TGID_OFFSET - 1) { + if (proc_fill_cache(filp, dirent, filldir, "self", 4, + NULL, NULL, NULL) < 0) + goto out; + iter.tgid = 0; + } else { + iter.tgid = pos - TGID_OFFSET; + } iter.task = NULL; - iter.tgid = filp->f_pos - TGID_OFFSET; + ns = filp->f_dentry->d_sb->s_fs_info; for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { diff --git a/fs/proc/fd.h b/fs/proc/fd.h index cbb1d47deda8..7c047f256ae2 100644 --- a/fs/proc/fd.h +++ b/fs/proc/fd.h @@ -11,4 +11,9 @@ extern const struct inode_operations proc_fdinfo_inode_operations; extern int proc_fd_permission(struct inode *inode, int mask); +static inline int proc_fd(struct inode *inode) +{ + return PROC_I(inode)->fd; +} + #endif /* __PROCFS_FD_H__ */ diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 4b3b3ffb52f1..a2596afffae6 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -36,212 +36,6 @@ static int proc_match(unsigned int len, const char *name, struct proc_dir_entry return !memcmp(name, de->name, len); } -/* buffer size is one page but our output routines use some slack for overruns */ -#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024) - -static ssize_t -__proc_file_read(struct file *file, char __user *buf, size_t nbytes, - loff_t *ppos) -{ - struct inode * inode = file_inode(file); - char *page; - ssize_t retval=0; - int eof=0; - ssize_t n, count; - char *start; - struct proc_dir_entry * dp; - unsigned long long pos; - - /* - * Gaah, please just use "seq_file" instead. The legacy /proc - * interfaces cut loff_t down to off_t for reads, and ignore - * the offset entirely for writes.. 
- */ - pos = *ppos; - if (pos > MAX_NON_LFS) - return 0; - if (nbytes > MAX_NON_LFS - pos) - nbytes = MAX_NON_LFS - pos; - - dp = PDE(inode); - if (!(page = (char*) __get_free_page(GFP_TEMPORARY))) - return -ENOMEM; - - while ((nbytes > 0) && !eof) { - count = min_t(size_t, PROC_BLOCK_SIZE, nbytes); - - start = NULL; - if (dp->read_proc) { - /* - * How to be a proc read function - * ------------------------------ - * Prototype: - * int f(char *buffer, char **start, off_t offset, - * int count, int *peof, void *dat) - * - * Assume that the buffer is "count" bytes in size. - * - * If you know you have supplied all the data you - * have, set *peof. - * - * You have three ways to return data: - * 0) Leave *start = NULL. (This is the default.) - * Put the data of the requested offset at that - * offset within the buffer. Return the number (n) - * of bytes there are from the beginning of the - * buffer up to the last byte of data. If the - * number of supplied bytes (= n - offset) is - * greater than zero and you didn't signal eof - * and the reader is prepared to take more data - * you will be called again with the requested - * offset advanced by the number of bytes - * absorbed. This interface is useful for files - * no larger than the buffer. - * 1) Set *start = an unsigned long value less than - * the buffer address but greater than zero. - * Put the data of the requested offset at the - * beginning of the buffer. Return the number of - * bytes of data placed there. If this number is - * greater than zero and you didn't signal eof - * and the reader is prepared to take more data - * you will be called again with the requested - * offset advanced by *start. This interface is - * useful when you have a large file consisting - * of a series of blocks which you want to count - * and return as wholes. - * (Hack by Paul.Russell@rustcorp.com.au) - * 2) Set *start = an address within the buffer. - * Put the data of the requested offset at *start. 
- * Return the number of bytes of data placed there. - * If this number is greater than zero and you - * didn't signal eof and the reader is prepared to - * take more data you will be called again with the - * requested offset advanced by the number of bytes - * absorbed. - */ - n = dp->read_proc(page, &start, *ppos, - count, &eof, dp->data); - } else - break; - - if (n == 0) /* end of file */ - break; - if (n < 0) { /* error */ - if (retval == 0) - retval = n; - break; - } - - if (start == NULL) { - if (n > PAGE_SIZE) /* Apparent buffer overflow */ - n = PAGE_SIZE; - n -= *ppos; - if (n <= 0) - break; - if (n > count) - n = count; - start = page + *ppos; - } else if (start < page) { - if (n > PAGE_SIZE) /* Apparent buffer overflow */ - n = PAGE_SIZE; - if (n > count) { - /* - * Don't reduce n because doing so might - * cut off part of a data block. - */ - pr_warn("proc_file_read: count exceeded\n"); - } - } else /* start >= page */ { - unsigned long startoff = (unsigned long)(start - page); - if (n > (PAGE_SIZE - startoff)) /* buffer overflow? */ - n = PAGE_SIZE - startoff; - if (n > count) - n = count; - } - - n -= copy_to_user(buf, start < page ? page : start, n); - if (n == 0) { - if (retval == 0) - retval = -EFAULT; - break; - } - - *ppos += start < page ? 
(unsigned long)start : n; - nbytes -= n; - buf += n; - retval += n; - } - free_page((unsigned long) page); - return retval; -} - -static ssize_t -proc_file_read(struct file *file, char __user *buf, size_t nbytes, - loff_t *ppos) -{ - struct proc_dir_entry *pde = PDE(file_inode(file)); - ssize_t rv = -EIO; - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; - } - pde->pde_users++; - spin_unlock(&pde->pde_unload_lock); - - rv = __proc_file_read(file, buf, nbytes, ppos); - - pde_users_dec(pde); - return rv; -} - -static ssize_t -proc_file_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct proc_dir_entry *pde = PDE(file_inode(file)); - ssize_t rv = -EIO; - - if (pde->write_proc) { - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; - } - pde->pde_users++; - spin_unlock(&pde->pde_unload_lock); - - /* FIXME: does this routine need ppos? probably... 
*/ - rv = pde->write_proc(file, buffer, count, pde->data); - pde_users_dec(pde); - } - return rv; -} - - -static loff_t -proc_file_lseek(struct file *file, loff_t offset, int orig) -{ - loff_t retval = -EINVAL; - switch (orig) { - case 1: - offset += file->f_pos; - /* fallthrough */ - case 0: - if (offset < 0 || offset > MAX_NON_LFS) - break; - file->f_pos = retval = offset; - } - return retval; -} - -static const struct file_operations proc_file_operations = { - .llseek = proc_file_lseek, - .read = proc_file_read, - .write = proc_file_write, -}; - static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = dentry->d_inode; @@ -371,7 +165,7 @@ void proc_free_inum(unsigned int inum) static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) { - nd_set_link(nd, PDE(dentry->d_inode)->data); + nd_set_link(nd, __PDE_DATA(dentry->d_inode)); return NULL; } @@ -541,19 +335,17 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp return ret; if (S_ISDIR(dp->mode)) { - if (dp->proc_iops == NULL) { - dp->proc_fops = &proc_dir_operations; - dp->proc_iops = &proc_dir_inode_operations; - } + dp->proc_fops = &proc_dir_operations; + dp->proc_iops = &proc_dir_inode_operations; dir->nlink++; } else if (S_ISLNK(dp->mode)) { - if (dp->proc_iops == NULL) - dp->proc_iops = &proc_link_inode_operations; + dp->proc_iops = &proc_link_inode_operations; } else if (S_ISREG(dp->mode)) { - if (dp->proc_fops == NULL) - dp->proc_fops = &proc_file_operations; - if (dp->proc_iops == NULL) - dp->proc_iops = &proc_file_inode_operations; + BUG_ON(dp->proc_fops == NULL); + dp->proc_iops = &proc_file_inode_operations; + } else { + WARN_ON(1); + return -EINVAL; } spin_lock(&proc_subdir_lock); @@ -636,13 +428,17 @@ struct proc_dir_entry *proc_symlink(const char *name, } EXPORT_SYMBOL(proc_symlink); -struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, - struct proc_dir_entry *parent) +struct proc_dir_entry 
*proc_mkdir_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data) { struct proc_dir_entry *ent; + if (mode == 0) + mode = S_IRUGO | S_IXUGO; + ent = __proc_create(&parent, name, S_IFDIR | mode, 2); if (ent) { + ent->data = data; if (proc_register(parent, ent) < 0) { kfree(ent); ent = NULL; @@ -650,82 +446,39 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, } return ent; } -EXPORT_SYMBOL(proc_mkdir_mode); +EXPORT_SYMBOL_GPL(proc_mkdir_data); -struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, - struct proc_dir_entry *parent) +struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, + struct proc_dir_entry *parent) { - struct proc_dir_entry *ent; - - ent = __proc_create(&parent, name, S_IFDIR | S_IRUGO | S_IXUGO, 2); - if (ent) { - ent->data = net; - if (proc_register(parent, ent) < 0) { - kfree(ent); - ent = NULL; - } - } - return ent; + return proc_mkdir_data(name, mode, parent, NULL); } -EXPORT_SYMBOL_GPL(proc_net_mkdir); +EXPORT_SYMBOL(proc_mkdir_mode); struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent) { - return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); + return proc_mkdir_data(name, 0, parent, NULL); } EXPORT_SYMBOL(proc_mkdir); -struct proc_dir_entry *create_proc_entry(const char *name, umode_t mode, - struct proc_dir_entry *parent) -{ - struct proc_dir_entry *ent; - nlink_t nlink; - - if (S_ISDIR(mode)) { - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO | S_IXUGO; - nlink = 2; - } else { - if ((mode & S_IFMT) == 0) - mode |= S_IFREG; - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO; - nlink = 1; - } - - ent = __proc_create(&parent, name, mode, nlink); - if (ent) { - if (proc_register(parent, ent) < 0) { - kfree(ent); - ent = NULL; - } - } - return ent; -} -EXPORT_SYMBOL(create_proc_entry); - struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations 
*proc_fops, void *data) { struct proc_dir_entry *pde; - nlink_t nlink; + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; - if (S_ISDIR(mode)) { - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO | S_IXUGO; - nlink = 2; - } else { - if ((mode & S_IFMT) == 0) - mode |= S_IFREG; - if ((mode & S_IALLUGO) == 0) - mode |= S_IRUGO; - nlink = 1; + if (!S_ISREG(mode)) { + WARN_ON(1); /* use proc_mkdir() */ + return NULL; } - pde = __proc_create(&parent, name, mode, nlink); + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + pde = __proc_create(&parent, name, mode, 1); if (!pde) goto out; pde->proc_fops = proc_fops; @@ -739,6 +492,19 @@ out: return NULL; } EXPORT_SYMBOL(proc_create_data); + +void proc_set_size(struct proc_dir_entry *de, loff_t size) +{ + de->size = size; +} +EXPORT_SYMBOL(proc_set_size); + +void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) +{ + de->uid = uid; + de->gid = gid; +} +EXPORT_SYMBOL(proc_set_user); static void free_proc_entry(struct proc_dir_entry *de) { @@ -786,37 +552,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) return; } - spin_lock(&de->pde_unload_lock); - /* - * Stop accepting new callers into module. If you're - * dynamically allocating ->proc_fops, save a pointer somewhere. - */ - de->proc_fops = NULL; - /* Wait until all existing callers into module are done. 
*/ - if (de->pde_users > 0) { - DECLARE_COMPLETION_ONSTACK(c); - - if (!de->pde_unload_completion) - de->pde_unload_completion = &c; - - spin_unlock(&de->pde_unload_lock); - - wait_for_completion(de->pde_unload_completion); - - spin_lock(&de->pde_unload_lock); - } - - while (!list_empty(&de->pde_openers)) { - struct pde_opener *pdeo; - - pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); - list_del(&pdeo->lh); - spin_unlock(&de->pde_unload_lock); - pdeo->release(pdeo->inode, pdeo->file); - kfree(pdeo); - spin_lock(&de->pde_unload_lock); - } - spin_unlock(&de->pde_unload_lock); + proc_entry_rundown(de); if (S_ISDIR(de->mode)) parent->nlink--; @@ -827,3 +563,77 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) pde_put(de); } EXPORT_SYMBOL(remove_proc_entry); + +int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry **p; + struct proc_dir_entry *root = NULL, *de, *next; + const char *fn = name; + unsigned int len; + + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); + return -ENOENT; + } + len = strlen(fn); + + for (p = &parent->subdir; *p; p=&(*p)->next ) { + if (proc_match(len, fn, *p)) { + root = *p; + *p = root->next; + root->next = NULL; + break; + } + } + if (!root) { + spin_unlock(&proc_subdir_lock); + return -ENOENT; + } + de = root; + while (1) { + next = de->subdir; + if (next) { + de->subdir = next->next; + next->next = NULL; + de = next; + continue; + } + spin_unlock(&proc_subdir_lock); + + proc_entry_rundown(de); + next = de->parent; + if (S_ISDIR(de->mode)) + next->nlink--; + de->nlink = 0; + if (de == root) + break; + pde_put(de); + + spin_lock(&proc_subdir_lock); + de = next; + } + pde_put(root); + return 0; +} +EXPORT_SYMBOL(remove_proc_subtree); + +void *proc_get_parent_data(const struct inode *inode) +{ + struct proc_dir_entry *de = PDE(inode); + return de->parent->data; +} 
+EXPORT_SYMBOL_GPL(proc_get_parent_data); + +void proc_remove(struct proc_dir_entry *de) +{ + if (de) + remove_proc_subtree(de->name, de->parent); +} +EXPORT_SYMBOL(proc_remove); + +void *PDE_DATA(const struct inode *inode) +{ + return __PDE_DATA(inode); +} +EXPORT_SYMBOL(PDE_DATA); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a86aebc9ba7c..073aea60cf8f 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -22,6 +22,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/mount.h> +#include <linux/magic.h> #include <asm/uaccess.h> @@ -50,8 +51,8 @@ static void proc_evict_inode(struct inode *inode) sysctl_head_put(head); } /* Release any associated namespace */ - ns_ops = PROC_I(inode)->ns_ops; - ns = PROC_I(inode)->ns; + ns_ops = PROC_I(inode)->ns.ns_ops; + ns = PROC_I(inode)->ns.ns; if (ns_ops && ns) ns_ops->put(ns); } @@ -72,8 +73,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; - ei->ns = NULL; - ei->ns_ops = NULL; + ei->ns.ns = NULL; + ei->ns.ns_ops = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; return inode; @@ -129,96 +130,100 @@ static const struct super_operations proc_sops = { .show_options = proc_show_options, }; -static void __pde_users_dec(struct proc_dir_entry *pde) +enum {BIAS = -1U<<31}; + +static inline int use_pde(struct proc_dir_entry *pde) +{ + return atomic_inc_unless_negative(&pde->in_use); +} + +static void unuse_pde(struct proc_dir_entry *pde) { - pde->pde_users--; - if (pde->pde_unload_completion && pde->pde_users == 0) + if (atomic_dec_return(&pde->in_use) == BIAS) complete(pde->pde_unload_completion); } -void pde_users_dec(struct proc_dir_entry *pde) +/* pde is locked */ +static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { - spin_lock(&pde->pde_unload_lock); - __pde_users_dec(pde); - spin_unlock(&pde->pde_unload_lock); + if (pdeo->closing) { + /* somebody else is 
doing that, just wait */ + DECLARE_COMPLETION_ONSTACK(c); + pdeo->c = &c; + spin_unlock(&pde->pde_unload_lock); + wait_for_completion(&c); + spin_lock(&pde->pde_unload_lock); + } else { + struct file *file; + pdeo->closing = 1; + spin_unlock(&pde->pde_unload_lock); + file = pdeo->file; + pde->proc_fops->release(file_inode(file), file); + spin_lock(&pde->pde_unload_lock); + list_del_init(&pdeo->lh); + if (pdeo->c) + complete(pdeo->c); + kfree(pdeo); + } +} + +void proc_entry_rundown(struct proc_dir_entry *de) +{ + DECLARE_COMPLETION_ONSTACK(c); + /* Wait until all existing callers into module are done. */ + de->pde_unload_completion = &c; + if (atomic_add_return(BIAS, &de->in_use) != BIAS) + wait_for_completion(&c); + + spin_lock(&de->pde_unload_lock); + while (!list_empty(&de->pde_openers)) { + struct pde_opener *pdeo; + pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); + close_pdeo(de, pdeo); + } + spin_unlock(&de->pde_unload_lock); } static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; - loff_t (*llseek)(struct file *, loff_t, int); - - spin_lock(&pde->pde_unload_lock); - /* - * remove_proc_entry() is going to delete PDE (as part of module - * cleanup sequence). No new callers into module allowed. - */ - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + loff_t (*llseek)(struct file *, loff_t, int); + llseek = pde->proc_fops->llseek; + if (!llseek) + llseek = default_llseek; + rv = llseek(file, offset, whence); + unuse_pde(pde); } - /* - * Bump refcount so that remove_proc_entry will wail for ->llseek to - * complete. - */ - pde->pde_users++; - /* - * Save function pointer under lock, to protect against ->proc_fops - * NULL'ifying right after ->pde_unload_lock is dropped. 
- */ - llseek = pde->proc_fops->llseek; - spin_unlock(&pde->pde_unload_lock); - - if (!llseek) - llseek = default_llseek; - rv = llseek(file, offset, whence); - - pde_users_dec(pde); return rv; } static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + read = pde->proc_fops->read; + if (read) + rv = read(file, buf, count, ppos); + unuse_pde(pde); } - pde->pde_users++; - read = pde->proc_fops->read; - spin_unlock(&pde->pde_unload_lock); - - if (read) - rv = read(file, buf, count, ppos); - - pde_users_dec(pde); return rv; } static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + write = pde->proc_fops->write; + if (write) + rv = write(file, buf, count, ppos); + unuse_pde(pde); } - pde->pde_users++; - write = pde->proc_fops->write; - spin_unlock(&pde->pde_unload_lock); - - if (write) - rv = write(file, buf, count, ppos); - - pde_users_dec(pde); return rv; } @@ -227,20 +232,12 @@ static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *p struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned int rv = DEFAULT_POLLMASK; unsigned int (*poll)(struct file *, struct poll_table_struct *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - 
spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + poll = pde->proc_fops->poll; + if (poll) + rv = poll(file, pts); + unuse_pde(pde); } - pde->pde_users++; - poll = pde->proc_fops->poll; - spin_unlock(&pde->pde_unload_lock); - - if (poll) - rv = poll(file, pts); - - pde_users_dec(pde); return rv; } @@ -249,20 +246,12 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; long (*ioctl)(struct file *, unsigned int, unsigned long); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + ioctl = pde->proc_fops->unlocked_ioctl; + if (ioctl) + rv = ioctl(file, cmd, arg); + unuse_pde(pde); } - pde->pde_users++; - ioctl = pde->proc_fops->unlocked_ioctl; - spin_unlock(&pde->pde_unload_lock); - - if (ioctl) - rv = ioctl(file, cmd, arg); - - pde_users_dec(pde); return rv; } @@ -272,20 +261,12 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; long (*compat_ioctl)(struct file *, unsigned int, unsigned long); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); - return rv; + if (use_pde(pde)) { + compat_ioctl = pde->proc_fops->compat_ioctl; + if (compat_ioctl) + rv = compat_ioctl(file, cmd, arg); + unuse_pde(pde); } - pde->pde_users++; - compat_ioctl = pde->proc_fops->compat_ioctl; - spin_unlock(&pde->pde_unload_lock); - - if (compat_ioctl) - rv = compat_ioctl(file, cmd, arg); - - pde_users_dec(pde); return rv; } #endif @@ -295,20 +276,12 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; int (*mmap)(struct file *, struct vm_area_struct *); - - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); 
- return rv; + if (use_pde(pde)) { + mmap = pde->proc_fops->mmap; + if (mmap) + rv = mmap(file, vma); + unuse_pde(pde); } - pde->pde_users++; - mmap = pde->proc_fops->mmap; - spin_unlock(&pde->pde_unload_lock); - - if (mmap) - rv = mmap(file, vma); - - pde_users_dec(pde); return rv; } @@ -330,91 +303,47 @@ static int proc_reg_open(struct inode *inode, struct file *file) * by hand in remove_proc_entry(). For this, save opener's credentials * for later. */ - pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); if (!pdeo) return -ENOMEM; - spin_lock(&pde->pde_unload_lock); - if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + if (!use_pde(pde)) { kfree(pdeo); return -ENOENT; } - pde->pde_users++; open = pde->proc_fops->open; release = pde->proc_fops->release; - spin_unlock(&pde->pde_unload_lock); if (open) rv = open(inode, file); - spin_lock(&pde->pde_unload_lock); if (rv == 0 && release) { /* To know what to release. */ - pdeo->inode = inode; pdeo->file = file; /* Strictly for "too late" ->release in proc_reg_release(). 
*/ - pdeo->release = release; + spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); + spin_unlock(&pde->pde_unload_lock); } else kfree(pdeo); - __pde_users_dec(pde); - spin_unlock(&pde->pde_unload_lock); - return rv; -} - -static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde, - struct inode *inode, struct file *file) -{ - struct pde_opener *pdeo; - list_for_each_entry(pdeo, &pde->pde_openers, lh) { - if (pdeo->inode == inode && pdeo->file == file) - return pdeo; - } - return NULL; + unuse_pde(pde); + return rv; } static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); - int rv = 0; - int (*release)(struct inode *, struct file *); struct pde_opener *pdeo; - spin_lock(&pde->pde_unload_lock); - pdeo = find_pde_opener(pde, inode, file); - if (!pde->proc_fops) { - /* - * Can't simply exit, __fput() will think that everything is OK, - * and move on to freeing struct file. remove_proc_entry() will - * find slacker in opener's list and will try to do non-trivial - * things with struct file. Therefore, remove opener from list. - * - * But if opener is removed from list, who will ->release it? 
- */ - if (pdeo) { - list_del(&pdeo->lh); - spin_unlock(&pde->pde_unload_lock); - rv = pdeo->release(inode, file); - kfree(pdeo); - } else - spin_unlock(&pde->pde_unload_lock); - return rv; - } - pde->pde_users++; - release = pde->proc_fops->release; - if (pdeo) { - list_del(&pdeo->lh); - kfree(pdeo); + list_for_each_entry(pdeo, &pde->pde_openers, lh) { + if (pdeo->file == file) { + close_pdeo(pde, pdeo); + break; + } } spin_unlock(&pde->pde_unload_lock); - - if (release) - rv = release(inode, file); - - pde_users_dec(pde); - return rv; + return 0; } static const struct file_operations proc_reg_file_ops = { @@ -446,9 +375,10 @@ static const struct file_operations proc_reg_file_ops_no_compat = { struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { - struct inode *inode = iget_locked(sb, de->low_ino); + struct inode *inode = new_inode_pseudo(sb); - if (inode && (inode->i_state & I_NEW)) { + if (inode) { + inode->i_ino = de->low_ino; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->pde = de; @@ -461,8 +391,8 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_size = de->size; if (de->nlink) set_nlink(inode, de->nlink); - if (de->proc_iops) - inode->i_op = de->proc_iops; + WARN_ON(!de->proc_iops); + inode->i_op = de->proc_iops; if (de->proc_fops) { if (S_ISREG(inode->i_mode)) { #ifdef CONFIG_COMPAT @@ -476,7 +406,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_fop = de->proc_fops; } } - unlock_new_inode(inode); } else pde_put(de); return inode; @@ -506,5 +435,5 @@ int proc_fill_super(struct super_block *s) return -ENOMEM; } - return 0; + return proc_setup_self(s); } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 85ff3a4598b3..d600fb098b6a 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -1,4 +1,4 @@ -/* internal.h: internal procfs definitions +/* Internal procfs definitions * * Copyright (C) 2004 Red 
Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -9,80 +9,83 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/sched.h> #include <linux/proc_fs.h> +#include <linux/proc_ns.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> #include <linux/binfmts.h> -struct ctl_table_header; -struct mempolicy; -extern struct proc_dir_entry proc_root; -extern void proc_self_init(void); -#ifdef CONFIG_PROC_SYSCTL -extern int proc_sys_init(void); -extern void sysctl_head_put(struct ctl_table_header *head); -#else -static inline void proc_sys_init(void) { } -static inline void sysctl_head_put(struct ctl_table_header *head) { } -#endif -#ifdef CONFIG_NET -extern int proc_net_init(void); -#else -static inline int proc_net_init(void) { return 0; } -#endif +struct ctl_table_header; +struct mempolicy; -struct vmalloc_info { - unsigned long used; - unsigned long largest_chunk; +/* + * This is not completely implemented yet. The idea is to + * create an in-memory tree (like the actual /proc filesystem + * tree) of these proc_dir_entries, so that we can dynamically + * add new files to /proc. + * + * The "next" pointer creates a linked list of one /proc directory, + * while parent/subdir create the directory structure (every + * /proc file has a parent, but "subdir" is NULL for all + * non-directory entries). 
+ */ +struct proc_dir_entry { + unsigned int low_ino; + umode_t mode; + nlink_t nlink; + kuid_t uid; + kgid_t gid; + loff_t size; + const struct inode_operations *proc_iops; + const struct file_operations *proc_fops; + struct proc_dir_entry *next, *parent, *subdir; + void *data; + atomic_t count; /* use count */ + atomic_t in_use; /* number of callers into module in progress; */ + /* negative -> it's going away RSN */ + struct completion *pde_unload_completion; + struct list_head pde_openers; /* who did ->open, but not ->release */ + spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ + u8 namelen; + char name[]; }; -#ifdef CONFIG_MMU -#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) -extern void get_vmalloc_info(struct vmalloc_info *vmi); -#else - -#define VMALLOC_TOTAL 0UL -#define get_vmalloc_info(vmi) \ -do { \ - (vmi)->used = 0; \ - (vmi)->largest_chunk = 0; \ -} while(0) -#endif - -extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task); -extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); - -extern const struct file_operations proc_tid_children_operations; -extern const struct file_operations proc_pid_maps_operations; -extern const struct file_operations proc_tid_maps_operations; -extern const struct file_operations proc_pid_numa_maps_operations; -extern const struct file_operations proc_tid_numa_maps_operations; -extern const struct file_operations proc_pid_smaps_operations; -extern const struct file_operations proc_tid_smaps_operations; -extern const struct file_operations proc_clear_refs_operations; -extern 
const struct file_operations proc_pagemap_operations; -extern const struct file_operations proc_net_operations; -extern const struct inode_operations proc_net_inode_operations; -extern const struct inode_operations proc_pid_link_inode_operations; +union proc_op { + int (*proc_get_link)(struct dentry *, struct path *); + int (*proc_read)(struct task_struct *task, char *page); + int (*proc_show)(struct seq_file *m, + struct pid_namespace *ns, struct pid *pid, + struct task_struct *task); +}; -struct proc_maps_private { +struct proc_inode { struct pid *pid; - struct task_struct *task; -#ifdef CONFIG_MMU - struct vm_area_struct *tail_vma; -#endif -#ifdef CONFIG_NUMA - struct mempolicy *task_mempolicy; -#endif + int fd; + union proc_op op; + struct proc_dir_entry *pde; + struct ctl_table_header *sysctl; + struct ctl_table *sysctl_entry; + struct proc_ns ns; + struct inode vfs_inode; }; -void proc_init_inodecache(void); +/* + * General functions + */ +static inline struct proc_inode *PROC_I(const struct inode *inode) +{ + return container_of(inode, struct proc_inode, vfs_inode); +} + +static inline struct proc_dir_entry *PDE(const struct inode *inode) +{ + return PROC_I(inode)->pde; +} + +static inline void *__PDE_DATA(const struct inode *inode) +{ + return PDE(inode)->data; +} static inline struct pid *proc_pid(struct inode *inode) { @@ -94,11 +97,6 @@ static inline struct task_struct *get_proc_task(struct inode *inode) return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -static inline int proc_fd(struct inode *inode) -{ - return PROC_I(inode)->fd; -} - static inline int task_dumpable(struct task_struct *task) { int dumpable = 0; @@ -114,15 +112,6 @@ static inline int task_dumpable(struct task_struct *task) return 0; } -static inline int pid_delete_dentry(const struct dentry * dentry) -{ - /* Is the task we represent dead? - * If so, then don't put the dentry on the lru list, - * kill it immediately. 
- */ - return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; -} - static inline unsigned name_to_int(struct dentry *dentry) { const char *name = dentry->d_name.name; @@ -145,63 +134,165 @@ out: return ~0U; } -struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, - struct dentry *dentry); -int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, - filldir_t filldir); +/* + * Offset of the first process in the /proc root directory.. + */ +#define FIRST_PROCESS_ENTRY 256 -struct pde_opener { - struct inode *inode; - struct file *file; - int (*release)(struct inode *, struct file *); - struct list_head lh; -}; -void pde_users_dec(struct proc_dir_entry *pde); +/* Worst case buffer size needed for holding an integer. */ +#define PROC_NUMBUF 13 + +/* + * array.c + */ +extern const struct file_operations proc_tid_children_operations; + +extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_status(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); +extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); + +/* + * base.c + */ +extern const struct dentry_operations pid_dentry_operations; +extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int proc_setattr(struct dentry *, struct iattr *); +extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); +extern int pid_revalidate(struct dentry *, unsigned int); +extern int pid_delete_dentry(const struct dentry *); +extern int proc_pid_readdir(struct file *, void *, filldir_t); +extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); +extern loff_t mem_lseek(struct file *, loff_t, int); + +/* Lookups */ +typedef struct dentry 
*instantiate_t(struct inode *, struct dentry *, + struct task_struct *, const void *); +extern int proc_fill_cache(struct file *, void *, filldir_t, const char *, int, + instantiate_t, struct task_struct *, const void *); +/* + * generic.c + */ extern spinlock_t proc_subdir_lock; -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int); -int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); -unsigned long task_vsize(struct mm_struct *); -unsigned long task_statm(struct mm_struct *, - unsigned long *, unsigned long *, unsigned long *, unsigned long *); -void task_mem(struct seq_file *, struct mm_struct *); +extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); +extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, + struct dentry *); +extern int proc_readdir(struct file *, void *, filldir_t); +extern int proc_readdir_de(struct proc_dir_entry *, struct file *, void *, filldir_t); static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { atomic_inc(&pde->count); return pde; } -void pde_put(struct proc_dir_entry *pde); - -int proc_fill_super(struct super_block *); -struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); -int proc_remount(struct super_block *sb, int *flags, char *data); +extern void pde_put(struct proc_dir_entry *); /* - * These are generic /proc routines that use the internal - * "struct proc_dir_entry" tree to traverse the filesystem. - * - * The /proc root directory has extended versions to take care - * of the /proc/<pid> subdirectories. 
+ * inode.c */ -int proc_readdir(struct file *, void *, filldir_t); -struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); +struct pde_opener { + struct file *file; + struct list_head lh; + int closing; + struct completion *c; +}; +extern const struct inode_operations proc_pid_link_inode_operations; +extern void proc_init_inodecache(void); +extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +extern int proc_fill_super(struct super_block *); +extern void proc_entry_rundown(struct proc_dir_entry *); -/* Lookups */ -typedef struct dentry *instantiate_t(struct inode *, struct dentry *, - struct task_struct *, const void *); -int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - const char *name, int len, - instantiate_t instantiate, struct task_struct *task, const void *ptr); -int pid_revalidate(struct dentry *dentry, unsigned int flags); -struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); -extern const struct dentry_operations pid_dentry_operations; -int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int proc_setattr(struct dentry *dentry, struct iattr *attr); +/* + * proc_devtree.c + */ +#ifdef CONFIG_PROC_DEVICETREE +extern void proc_device_tree_init(void); +#endif +/* + * proc_namespaces.c + */ extern const struct inode_operations proc_ns_dir_inode_operations; extern const struct file_operations proc_ns_dir_operations; +/* + * proc_net.c + */ +extern const struct file_operations proc_net_operations; +extern const struct inode_operations proc_net_inode_operations; + +#ifdef CONFIG_NET +extern int proc_net_init(void); +#else +static inline int proc_net_init(void) { return 0; } +#endif + +/* + * proc_self.c + */ +extern int proc_setup_self(struct super_block *); + +/* + * proc_sysctl.c + */ +#ifdef CONFIG_PROC_SYSCTL +extern int proc_sys_init(void); +extern void sysctl_head_put(struct ctl_table_header *); +#else +static inline 
void proc_sys_init(void) { } +static inline void sysctl_head_put(struct ctl_table_header *head) { } +#endif + +/* + * proc_tty.c + */ +#ifdef CONFIG_TTY +extern void proc_tty_init(void); +#else +static inline void proc_tty_init(void) {} +#endif + +/* + * root.c + */ +extern struct proc_dir_entry proc_root; + +extern void proc_self_init(void); +extern int proc_remount(struct super_block *, int *, char *); + +/* + * task_[no]mmu.c + */ +struct proc_maps_private { + struct pid *pid; + struct task_struct *task; +#ifdef CONFIG_MMU + struct vm_area_struct *tail_vma; +#endif +#ifdef CONFIG_NUMA + struct mempolicy *task_mempolicy; +#endif +}; + +extern const struct file_operations proc_pid_maps_operations; +extern const struct file_operations proc_tid_maps_operations; +extern const struct file_operations proc_pid_numa_maps_operations; +extern const struct file_operations proc_tid_numa_maps_operations; +extern const struct file_operations proc_pid_smaps_operations; +extern const struct file_operations proc_tid_smaps_operations; +extern const struct file_operations proc_clear_refs_operations; +extern const struct file_operations proc_pagemap_operations; + +extern unsigned long task_vsize(struct mm_struct *); +extern unsigned long task_statm(struct mm_struct *, + unsigned long *, unsigned long *, + unsigned long *, unsigned long *); +extern void task_mem(struct seq_file *, struct mm_struct *); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index eda6f017f272..0a22194e5d58 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -11,10 +11,12 @@ #include <linux/mm.h> #include <linux/proc_fs.h> +#include <linux/kcore.h> #include <linux/user.h> #include <linux/capability.h> #include <linux/elf.h> #include <linux/elfcore.h> +#include <linux/notifier.h> #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/printk.h> @@ -27,6 +29,7 @@ #include <linux/ioport.h> #include <linux/memory.h> #include <asm/sections.h> +#include "internal.h" #define CORE_STR "CORE" @@ -564,7 
+567,6 @@ static const struct file_operations proc_kcore_operations = { .llseek = default_llseek, }; -#ifdef CONFIG_MEMORY_HOTPLUG /* just remember that we have to update kcore */ static int __meminit kcore_callback(struct notifier_block *self, unsigned long action, void *arg) @@ -578,8 +580,11 @@ static int __meminit kcore_callback(struct notifier_block *self, } return NOTIFY_OK; } -#endif +static struct notifier_block kcore_callback_nb __meminitdata = { + .notifier_call = kcore_callback, + .priority = 0, +}; static struct kcore_list kcore_vmalloc; @@ -631,7 +636,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - hotplug_memory_notifier(kcore_callback, 0); + register_hotmemory_notifier(&kcore_callback_nb); return 0; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 1efaaa19c4f3..5aa847a603c0 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -11,6 +11,7 @@ #include <linux/swap.h> #include <linux/vmstat.h> #include <linux/atomic.h> +#include <linux/vmalloc.h> #include <asm/page.h> #include <asm/pgtable.h> #include "internal.h" diff --git a/fs/proc/mmu.c b/fs/proc/mmu.c deleted file mode 100644 index 8ae221dfd010..000000000000 --- a/fs/proc/mmu.c +++ /dev/null @@ -1,60 +0,0 @@ -/* mmu.c: mmu memory info files - * - * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ -#include <linux/spinlock.h> -#include <linux/vmalloc.h> -#include <linux/highmem.h> -#include <asm/pgtable.h> -#include "internal.h" - -void get_vmalloc_info(struct vmalloc_info *vmi) -{ - struct vm_struct *vma; - unsigned long free_area_size; - unsigned long prev_end; - - vmi->used = 0; - - if (!vmlist) { - vmi->largest_chunk = VMALLOC_TOTAL; - } - else { - vmi->largest_chunk = 0; - - prev_end = VMALLOC_START; - - read_lock(&vmlist_lock); - - for (vma = vmlist; vma; vma = vma->next) { - unsigned long addr = (unsigned long) vma->addr; - - /* - * Some archs keep another range for modules in vmlist - */ - if (addr < VMALLOC_START) - continue; - if (addr >= VMALLOC_END) - break; - - vmi->used += vma->size; - - free_area_size = addr - prev_end; - if (vmi->largest_chunk < free_area_size) - vmi->largest_chunk = free_area_size; - - prev_end = vma->size + addr; - } - - if (VMALLOC_END - prev_end > vmi->largest_chunk) - vmi->largest_chunk = VMALLOC_END - prev_end; - - read_unlock(&vmlist_lock); - } -} diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index b7a47196c8c3..54bdc6701e9f 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -51,7 +51,7 @@ static int ns_delete_dentry(const struct dentry *dentry) static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = dentry->d_inode; - const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops; return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", ns_ops->name, inode->i_ino); @@ -95,8 +95,8 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb, inode->i_op = &ns_inode_operations; inode->i_mode = S_IFREG | S_IRUGO; inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns; + ei->ns.ns_ops = ns_ops; + ei->ns.ns = ns; unlock_new_inode(inode); } else { ns_ops->put(ns); @@ -118,7 +118,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) 
struct super_block *sb = inode->i_sb; struct proc_inode *ei = PROC_I(inode); struct task_struct *task; - struct dentry *ns_dentry; + struct path ns_path; void *error = ERR_PTR(-EACCES); task = get_proc_task(inode); @@ -128,14 +128,14 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; - ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); - if (IS_ERR(ns_dentry)) { - error = ERR_CAST(ns_dentry); + ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops); + if (IS_ERR(ns_path.dentry)) { + error = ERR_CAST(ns_path.dentry); goto out_put_task; } - dput(nd->path.dentry); - nd->path.dentry = ns_dentry; + ns_path.mnt = mntget(nd->path.mnt); + nd_jump_link(nd, &ns_path); error = NULL; out_put_task: @@ -148,7 +148,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl { struct inode *inode = dentry->d_inode; struct proc_inode *ei = PROC_I(inode); - const struct proc_ns_operations *ns_ops = ei->ns_ops; + const struct proc_ns_operations *ns_ops = ei->ns.ns_ops; struct task_struct *task; void *ns; char name[50]; @@ -202,7 +202,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, ei = PROC_I(inode); inode->i_mode = S_IFLNK|S_IRWXUGO; inode->i_op = &proc_ns_link_inode_operations; - ei->ns_ops = ns_ops; + ei->ns.ns_ops = ns_ops; d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); @@ -337,6 +337,11 @@ out_invalid: return ERR_PTR(-EINVAL); } +struct proc_ns *get_proc_ns(struct inode *inode) +{ + return &PROC_I(inode)->ns; +} + bool proc_ns_inode(struct inode *inode) { return inode->i_fop == &ns_file_operations; diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 30b590f5bd35..106a83570630 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -12,7 +12,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/of.h> -#include <linux/module.h> +#include <linux/export.h> #include 
<linux/slab.h> #include <asm/prom.h> #include <asm/uaccess.h> @@ -41,7 +41,7 @@ static int property_proc_show(struct seq_file *m, void *v) static int property_proc_open(struct inode *inode, struct file *file) { - return single_open(file, property_proc_show, PDE(inode)->data); + return single_open(file, property_proc_show, __PDE_DATA(inode)); } static const struct file_operations property_proc_fops = { diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index b4ac6572474f..986e83220d56 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -26,6 +26,10 @@ #include "internal.h" +static inline struct net *PDE_NET(struct proc_dir_entry *pde) +{ + return pde->parent->data; +} static struct net *get_proc_net(const struct inode *inode) { diff --git a/fs/proc/root.c b/fs/proc/root.c index c6e9fac26bac..41a6ea93f486 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/bitops.h> +#include <linux/user_namespace.h> #include <linux/mount.h> #include <linux/pid_namespace.h> #include <linux/parser.h> @@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } else { ns = task_active_pid_ns(current); options = data; + + if (!current_user_ns()->may_mount_proc) + return ERR_PTR(-EPERM); } sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); @@ -137,6 +141,8 @@ static void proc_kill_sb(struct super_block *sb) struct pid_namespace *ns; ns = (struct pid_namespace *)sb->s_fs_info; + if (ns->proc_self) + dput(ns->proc_self); kill_anon_super(sb); put_pid_ns(ns); } diff --git a/fs/proc/self.c b/fs/proc/self.c index aa5cc3bff140..6b6a993b5c25 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -1,6 +1,8 @@ -#include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/namei.h> +#include <linux/slab.h> +#include <linux/pid_namespace.h> +#include "internal.h" /* * /proc/self: @@ -48,12 +50,43 @@ static const struct inode_operations proc_self_inode_operations = { 
.put_link = proc_self_put_link, }; -void __init proc_self_init(void) +static unsigned self_inum; + +int proc_setup_self(struct super_block *s) { - struct proc_dir_entry *proc_self_symlink; - mode_t mode; + struct inode *root_inode = s->s_root->d_inode; + struct pid_namespace *ns = s->s_fs_info; + struct dentry *self; + + mutex_lock(&root_inode->i_mutex); + self = d_alloc_name(s->s_root, "self"); + if (self) { + struct inode *inode = new_inode_pseudo(s); + if (inode) { + inode->i_ino = self_inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_op = &proc_self_inode_operations; + d_add(self, inode); + } else { + dput(self); + self = ERR_PTR(-ENOMEM); + } + } else { + self = ERR_PTR(-ENOMEM); + } + mutex_unlock(&root_inode->i_mutex); + if (IS_ERR(self)) { + pr_err("proc_fill_super: can't allocate /proc/self\n"); + return PTR_ERR(self); + } + ns->proc_self = self; + return 0; +} - mode = S_IFLNK | S_IRWXUGO; - proc_self_symlink = proc_create("self", mode, NULL, NULL ); - proc_self_symlink->proc_iops = &proc_self_inode_operations; +void __init proc_self_init(void) +{ + proc_alloc_inum(&self_inum); } diff --git a/fs/proc/stat.c b/fs/proc/stat.c index e296572c73ed..1cf86c0e8689 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -184,7 +184,7 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - unsigned size = 1024 + 128 * num_possible_cpus(); + size_t size = 1024 + 128 * num_possible_cpus(); char *buf; struct seq_file *m; int res; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index b870f740ab5a..17f7e080d7ff 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -8,7 +8,7 @@ */ #include <linux/mm.h> -#include <linux/proc_fs.h> +#include <linux/kcore.h> #include <linux/user.h> #include <linux/elf.h> #include <linux/elfcore.h> @@ -22,6 +22,7 @@ #include <linux/list.h> 
#include <asm/uaccess.h> #include <asm/io.h> +#include "internal.h" /* List representing chunks of contiguous memory areas and their offsets in * vmcore file. @@ -698,7 +699,7 @@ void vmcore_cleanup(void) struct list_head *pos, *next; if (proc_vmcore) { - remove_proc_entry(proc_vmcore->name, proc_vmcore->parent); + proc_remove(proc_vmcore); proc_vmcore = NULL; } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 288f068740f6..32cbd7c8a90c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -83,7 +83,7 @@ struct ramoops_context { size_t console_size; size_t ftrace_size; int dump_oops; - int ecc_size; + struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; unsigned int dump_write_cnt; unsigned int dump_read_cnt; @@ -136,6 +136,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, char **buf, struct pstore_info *psi) { ssize_t size; + ssize_t ecc_notice_size; struct ramoops_context *cxt = psi->data; struct persistent_ram_zone *prz; @@ -156,12 +157,18 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, time->tv_nsec = 0; size = persistent_ram_old_size(prz); - *buf = kmalloc(size, GFP_KERNEL); + + /* ECC correction notice */ + ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); + + *buf = kmalloc(size + ecc_notice_size + 1, GFP_KERNEL); if (*buf == NULL) return -ENOMEM; + memcpy(*buf, persistent_ram_old(prz), size); + persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1); - return size; + return size + ecc_notice_size; } static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) @@ -323,7 +330,8 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, for (i = 0; i < cxt->max_dump_cnt; i++) { size_t sz = cxt->record_size; - cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, cxt->ecc_size); + cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, + &cxt->ecc_info); if (IS_ERR(cxt->przs[i])) { err = PTR_ERR(cxt->przs[i]); dev_err(dev, "failed to request mem region 
(0x%zx@0x%llx): %d\n", @@ -353,7 +361,7 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, return -ENOMEM; } - *prz = persistent_ram_new(*paddr, sz, sig, cxt->ecc_size); + *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); @@ -407,7 +415,7 @@ static int ramoops_probe(struct platform_device *pdev) cxt->console_size = pdata->console_size; cxt->ftrace_size = pdata->ftrace_size; cxt->dump_oops = pdata->dump_oops; - cxt->ecc_size = pdata->ecc_size; + cxt->ecc_info = pdata->ecc_info; paddr = cxt->phys_addr; @@ -465,9 +473,9 @@ static int ramoops_probe(struct platform_device *pdev) record_size = pdata->record_size; dump_oops = pdata->dump_oops; - pr_info("attached 0x%lx@0x%llx, ecc: %d\n", + pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n", cxt->size, (unsigned long long)cxt->phys_addr, - cxt->ecc_size); + cxt->ecc_info.ecc_size, cxt->ecc_info.block_size); return 0; @@ -539,7 +547,7 @@ static void ramoops_register_dummy(void) * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC * (using 1 byte for ECC isn't much of use anyway). */ - dummy_data->ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc; + dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 
16 : ramoops_ecc; dummy = platform_device_register_data(NULL, "ramoops", -1, dummy_data, sizeof(struct ramoops_platform_data)); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 0306303be372..59337326e288 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -82,12 +82,12 @@ static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, uint8_t *data, size_t len, uint8_t *ecc) { int i; - uint16_t par[prz->ecc_size]; + uint16_t par[prz->ecc_info.ecc_size]; /* Initialize the parity buffer */ memset(par, 0, sizeof(par)); encode_rs8(prz->rs_decoder, data, len, par, 0); - for (i = 0; i < prz->ecc_size; i++) + for (i = 0; i < prz->ecc_info.ecc_size; i++) ecc[i] = par[i]; } @@ -95,9 +95,9 @@ static int persistent_ram_decode_rs8(struct persistent_ram_zone *prz, void *data, size_t len, uint8_t *ecc) { int i; - uint16_t par[prz->ecc_size]; + uint16_t par[prz->ecc_info.ecc_size]; - for (i = 0; i < prz->ecc_size; i++) + for (i = 0; i < prz->ecc_info.ecc_size; i++) par[i] = ecc[i]; return decode_rs8(prz->rs_decoder, data, par, len, NULL, 0, NULL, 0, NULL); @@ -110,15 +110,15 @@ static void notrace persistent_ram_update_ecc(struct persistent_ram_zone *prz, uint8_t *buffer_end = buffer->data + prz->buffer_size; uint8_t *block; uint8_t *par; - int ecc_block_size = prz->ecc_block_size; - int ecc_size = prz->ecc_size; - int size = prz->ecc_block_size; + int ecc_block_size = prz->ecc_info.block_size; + int ecc_size = prz->ecc_info.ecc_size; + int size = ecc_block_size; - if (!prz->ecc_size) + if (!ecc_size) return; block = buffer->data + (start & ~(ecc_block_size - 1)); - par = prz->par_buffer + (start / ecc_block_size) * prz->ecc_size; + par = prz->par_buffer + (start / ecc_block_size) * ecc_size; do { if (block + ecc_block_size > buffer_end) @@ -133,7 +133,7 @@ static void persistent_ram_update_header_ecc(struct persistent_ram_zone *prz) { struct persistent_ram_buffer *buffer = prz->buffer; - if (!prz->ecc_size) + if (!prz->ecc_info.ecc_size) 
return; persistent_ram_encode_rs8(prz, (uint8_t *)buffer, sizeof(*buffer), @@ -146,14 +146,14 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) uint8_t *block; uint8_t *par; - if (!prz->ecc_size) + if (!prz->ecc_info.ecc_size) return; block = buffer->data; par = prz->par_buffer; while (block < buffer->data + buffer_size(prz)) { int numerr; - int size = prz->ecc_block_size; + int size = prz->ecc_info.block_size; if (block + size > buffer->data + prz->buffer_size) size = buffer->data + prz->buffer_size - block; numerr = persistent_ram_decode_rs8(prz, block, size, par); @@ -166,44 +166,49 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) block); prz->bad_blocks++; } - block += prz->ecc_block_size; - par += prz->ecc_size; + block += prz->ecc_info.block_size; + par += prz->ecc_info.ecc_size; } } static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, - int ecc_size) + struct persistent_ram_ecc_info *ecc_info) { int numerr; struct persistent_ram_buffer *buffer = prz->buffer; int ecc_blocks; size_t ecc_total; - int ecc_symsize = 8; - int ecc_poly = 0x11d; - if (!ecc_size) + if (!ecc_info || !ecc_info->ecc_size) return 0; - prz->ecc_block_size = 128; - prz->ecc_size = ecc_size; + prz->ecc_info.block_size = ecc_info->block_size ?: 128; + prz->ecc_info.ecc_size = ecc_info->ecc_size ?: 16; + prz->ecc_info.symsize = ecc_info->symsize ?: 8; + prz->ecc_info.poly = ecc_info->poly ?: 0x11d; - ecc_blocks = DIV_ROUND_UP(prz->buffer_size, prz->ecc_block_size); - ecc_total = (ecc_blocks + 1) * prz->ecc_size; + ecc_blocks = DIV_ROUND_UP(prz->buffer_size - prz->ecc_info.ecc_size, + prz->ecc_info.block_size + + prz->ecc_info.ecc_size); + ecc_total = (ecc_blocks + 1) * prz->ecc_info.ecc_size; if (ecc_total >= prz->buffer_size) { pr_err("%s: invalid ecc_size %u (total %zu, buffer size %zu)\n", - __func__, prz->ecc_size, ecc_total, prz->buffer_size); + __func__, prz->ecc_info.ecc_size, + ecc_total, prz->buffer_size); return -EINVAL; } 
prz->buffer_size -= ecc_total; prz->par_buffer = buffer->data + prz->buffer_size; - prz->par_header = prz->par_buffer + ecc_blocks * prz->ecc_size; + prz->par_header = prz->par_buffer + + ecc_blocks * prz->ecc_info.ecc_size; /* * first consecutive root is 0 * primitive element to generate roots = 1 */ - prz->rs_decoder = init_rs(ecc_symsize, ecc_poly, 0, 1, prz->ecc_size); + prz->rs_decoder = init_rs(prz->ecc_info.symsize, prz->ecc_info.poly, + 0, 1, prz->ecc_info.ecc_size); if (prz->rs_decoder == NULL) { pr_info("persistent_ram: init_rs failed\n"); return -EINVAL; @@ -230,6 +235,9 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz, { ssize_t ret; + if (!prz->ecc_info.ecc_size) + return 0; + if (prz->corrected_bytes || prz->bad_blocks) ret = snprintf(str, len, "" "\n%d Corrected bytes, %d unrecoverable blocks\n", @@ -391,11 +399,11 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, } static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, - int ecc_size) + struct persistent_ram_ecc_info *ecc_info) { int ret; - ret = persistent_ram_init_ecc(prz, ecc_size); + ret = persistent_ram_init_ecc(prz, ecc_info); if (ret) return ret; @@ -444,7 +452,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz) } struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, - u32 sig, int ecc_size) + u32 sig, struct persistent_ram_ecc_info *ecc_info) { struct persistent_ram_zone *prz; int ret = -ENOMEM; @@ -459,7 +467,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, if (ret) goto err; - ret = persistent_ram_post_init(prz, sig, ecc_size); + ret = persistent_ram_post_init(prz, sig, ecc_info); if (ret) goto err; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 43098bb5723a..2e8caa62da78 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -412,6 +412,7 @@ static struct file_system_type qnx4_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, 
}; +MODULE_ALIAS_FS("qnx4"); static int __init init_qnx4_fs(void) { diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 57199a52a351..8d941edfefa1 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -672,6 +672,7 @@ static struct file_system_type qnx6_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("qnx6"); static int __init init_qnx6_fs(void) { diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 05ae3c97f7a5..3e64169ef527 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1439,8 +1439,11 @@ static void __dquot_initialize(struct inode *inode, int type) * did a write before quota was turned on */ rsv = inode_get_rsv_space(inode); - if (unlikely(rsv)) + if (unlikely(rsv)) { + spin_lock(&dq_data_lock); dquot_resv_space(inode->i_dquot[cnt], rsv); + spin_unlock(&dq_data_lock); + } } } out_err: diff --git a/fs/read_write.c b/fs/read_write.c index a698eff457fb..03430008704e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,6 +9,7 @@ #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> +#include <linux/aio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> @@ -16,11 +17,15 @@ #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> -#include "read_write.h" +#include "internal.h" #include <asm/uaccess.h> #include <asm/unistd.h> +typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); +typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -127,7 +132,7 @@ EXPORT_SYMBOL(generic_file_llseek_size); * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by - * @offset and @whence under i_mutex. + * @offset and @whence. 
*/ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { @@ -325,16 +330,6 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; } -static void wait_on_retry_sync_kiocb(struct kiocb *iocb) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - if (!kiocbIsKicked(iocb)) - schedule(); - else - kiocbClearKicked(iocb); - __set_current_state(TASK_RUNNING); -} - ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = len }; @@ -346,13 +341,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -402,13 +391,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -417,6 +400,33 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof EXPORT_SYMBOL(do_sync_write); +ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + const char __user *p; + ssize_t ret; + + if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) + return -EINVAL; + + old_fs = get_fs(); + set_fs(get_ds()); + p = (__force const char __user *)buf; + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + if 
(file->f_op->write) + ret = file->f_op->write(file, p, count, pos); + else + ret = do_sync_write(file, p, count, pos); + set_fs(old_fs); + if (ret > 0) { + fsnotify_modify(file); + add_wchar(current, ret); + } + inc_syscw(current); + return ret; +} + ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; @@ -431,6 +441,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ ret = rw_verify_area(WRITE, file, pos, count); if (ret >= 0) { count = ret; + file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else @@ -440,6 +451,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ add_wchar(current, ret); } inc_syscw(current); + file_end_write(file); } return ret; @@ -487,8 +499,8 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, return ret; } -SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, - size_t count, loff_t pos) +SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, + size_t, count, loff_t, pos) { struct fd f; ssize_t ret = -EBADF; @@ -506,17 +518,9 @@ SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos) -{ - return SYSC_pread64((unsigned int) fd, (char __user *) buf, - (size_t) count, pos); -} -SYSCALL_ALIAS(sys_pread64, SyS_pread64); -#endif -SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, - size_t count, loff_t pos) +SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, + size_t, count, loff_t, pos) { struct fd f; ssize_t ret = -EBADF; @@ -534,14 +538,6 @@ SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos) -{ - return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf, 
- (size_t) count, pos); -} -SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64); -#endif /* * Reduce an iovec's length in-place. Return the resulting number of segments @@ -564,7 +560,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) } EXPORT_SYMBOL(iov_shorten); -ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, +static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) { struct kiocb kiocb; @@ -575,13 +571,7 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -589,7 +579,7 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, } /* Do it by hand, with file-ops */ -ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, +static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, unsigned long nr_segs, loff_t *ppos, io_fn_t fn) { struct iovec *vector = iov; @@ -731,6 +721,7 @@ static ssize_t do_readv_writev(int type, struct file *file, } else { fn = (io_fn_t)file->f_op->write; fnv = file->f_op->aio_write; + file_start_write(file); } if (fnv) @@ -739,6 +730,9 @@ static ssize_t do_readv_writev(int type, struct file *file, else ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); + if (type != READ) + file_end_write(file); + out: if (iov != iovstack) kfree(iov); @@ -869,8 +863,203 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, return ret; } -ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, - loff_t max) +#ifdef CONFIG_COMPAT + +static ssize_t compat_do_readv_writev(int type, struct file *file, + const 
struct compat_iovec __user *uvector, + unsigned long nr_segs, loff_t *pos) +{ + compat_ssize_t tot_len; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + ssize_t ret; + io_fn_t fn; + iov_fn_t fnv; + + ret = -EINVAL; + if (!file->f_op) + goto out; + + ret = -EFAULT; + if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + goto out; + + ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, + UIO_FASTIOV, iovstack, &iov); + if (ret <= 0) + goto out; + + tot_len = ret; + ret = rw_verify_area(type, file, pos, tot_len); + if (ret < 0) + goto out; + + fnv = NULL; + if (type == READ) { + fn = file->f_op->read; + fnv = file->f_op->aio_read; + } else { + fn = (io_fn_t)file->f_op->write; + fnv = file->f_op->aio_write; + file_start_write(file); + } + + if (fnv) + ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, + pos, fnv); + else + ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); + + if (type != READ) + file_end_write(file); + +out: + if (iov != iovstack) + kfree(iov); + if ((ret + (type == READ)) > 0) { + if (type == READ) + fsnotify_access(file); + else + fsnotify_modify(file); + } + return ret; +} + +static size_t compat_readv(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + + if (!(file->f_mode & FMODE_READ)) + goto out; + + ret = -EINVAL; + if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read)) + goto out; + + ret = compat_do_readv_writev(READ, file, vec, vlen, pos); + +out: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); + return ret; +} + +COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen) +{ + struct fd f = fdget(fd); + ssize_t ret; + loff_t pos; + + if (!f.file) + return -EBADF; + pos = f.file->f_pos; + ret = compat_readv(f.file, vec, vlen, &pos); + f.file->f_pos = pos; + fdput(f); + return ret; +} + +COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, + 
const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos) +{ + struct fd f; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + f = fdget(fd); + if (!f.file) + return -EBADF; + ret = -ESPIPE; + if (f.file->f_mode & FMODE_PREAD) + ret = compat_readv(f.file, vec, vlen, &pos); + fdput(f); + return ret; +} + +COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, u32, pos_low, u32, pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + return compat_sys_preadv64(fd, vec, vlen, pos); +} + +static size_t compat_writev(struct file *file, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t *pos) +{ + ssize_t ret = -EBADF; + + if (!(file->f_mode & FMODE_WRITE)) + goto out; + + ret = -EINVAL; + if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write)) + goto out; + + ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); + +out: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); + return ret; +} + +COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd, + const struct compat_iovec __user *, vec, + unsigned long, vlen) +{ + struct fd f = fdget(fd); + ssize_t ret; + loff_t pos; + + if (!f.file) + return -EBADF; + pos = f.file->f_pos; + ret = compat_writev(f.file, vec, vlen, &pos); + f.file->f_pos = pos; + fdput(f); + return ret; +} + +COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos) +{ + struct fd f; + ssize_t ret; + + if (pos < 0) + return -EINVAL; + f = fdget(fd); + if (!f.file) + return -EBADF; + ret = -ESPIPE; + if (f.file->f_mode & FMODE_PWRITE) + ret = compat_writev(f.file, vec, vlen, &pos); + fdput(f); + return ret; +} + +COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, u32, pos_low, u32, pos_high) +{ + loff_t pos = ((loff_t)pos_high << 32) | pos_low; + return compat_sys_pwritev64(fd, vec, vlen, 
pos); +} +#endif + +static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, + size_t count, loff_t max) { struct fd in, out; struct inode *in_inode, *out_inode; @@ -994,3 +1183,43 @@ SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, si return do_sendfile(out_fd, in_fd, NULL, count, 0); } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, + compat_off_t __user *, offset, compat_size_t, count) +{ + loff_t pos; + off_t off; + ssize_t ret; + + if (offset) { + if (unlikely(get_user(off, offset))) + return -EFAULT; + pos = off; + ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} + +COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, + compat_loff_t __user *, offset, compat_size_t, count) +{ + loff_t pos; + ssize_t ret; + + if (offset) { + if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) + return -EFAULT; + ret = do_sendfile(out_fd, in_fd, &pos, count, 0); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} +#endif diff --git a/fs/read_write.h b/fs/read_write.h deleted file mode 100644 index d3e00ef67420..000000000000 --- a/fs/read_write.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * This file is only for sharing some helpers from read_write.c with compat.c. - * Don't use anywhere else. 
- */ - - -typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); -typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, - unsigned long, loff_t); - -ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, - unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn); -ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, - unsigned long nr_segs, loff_t *ppos, io_fn_t fn); -ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, - loff_t max); diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 6165bd4784f6..dcaafcfc23b0 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -234,68 +234,9 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, return ret; } -/* Write @count bytes at position @ppos in a file indicated by @file - from the buffer @buf. - - generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want - something simple that works. It is not for serious use by general purpose filesystems, excepting the one that it was - written for (ext2/3). This is for several reasons: - - * It has no understanding of any filesystem specific optimizations. - - * It enters the filesystem repeatedly for each page that is written. - - * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key - * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time - * to reiserfs which allows for fewer tree traversals. - - * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks. - - * Asking the block allocation code for blocks one at a time is slightly less efficient. - - All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to - use it, but we were in a hurry to make code freeze, and so it couldn't be revised then. 
This new code should make - things right finally. - - Future Features: providing search_by_key with hints. - -*/ -static ssize_t reiserfs_file_write(struct file *file, /* the file we are going to write into */ - const char __user * buf, /* pointer to user supplied data - (in userspace) */ - size_t count, /* amount of bytes to write */ - loff_t * ppos /* pointer to position in file that we start writing at. Should be updated to - * new current position before returning. */ - ) -{ - struct inode *inode = file_inode(file); // Inode of the file that we are writing to. - /* To simplify coding at this time, we store - locked pages in array for now */ - struct reiserfs_transaction_handle th; - th.t_trans_id = 0; - - /* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items - * lying around (most of the disk, in fact). Despite the filesystem - * now being a v3.6 format, the old items still can't support large - * file sizes. Catch this case here, as the rest of the VFS layer is - * oblivious to the different limitations between old and new items. - * reiserfs_setattr catches this for truncates. This chunk is lifted - * from generic_write_checks. 
*/ - if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 && - *ppos + count > MAX_NON_LFS) { - if (*ppos >= MAX_NON_LFS) { - return -EFBIG; - } - if (count > MAX_NON_LFS - (unsigned long)*ppos) - count = MAX_NON_LFS - (unsigned long)*ppos; - } - - return do_sync_write(file, buf, count, ppos); -} - const struct file_operations reiserfs_file_operations = { .read = do_sync_read, - .write = reiserfs_file_write, + .write = do_sync_write, .unlocked_ioctl = reiserfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = reiserfs_compat_ioctl, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ea5061fd4f3e..77d6d47abc83 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -18,6 +18,7 @@ #include <linux/writeback.h> #include <linux/quotaops.h> #include <linux/swap.h> +#include <linux/aio.h> int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index afcadcc03e8a..742fdd4c209a 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -97,7 +97,7 @@ static int flush_commit_list(struct super_block *s, static int can_dirty(struct reiserfs_journal_cnode *cn); static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *sb, unsigned long nblocks); -static int release_journal_dev(struct super_block *super, +static void release_journal_dev(struct super_block *super, struct reiserfs_journal *journal); static int dirty_one_transaction(struct super_block *s, struct reiserfs_journal_list *jl); @@ -2532,23 +2532,13 @@ static void journal_list_init(struct super_block *sb) SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb); } -static int release_journal_dev(struct super_block *super, +static void release_journal_dev(struct super_block *super, struct reiserfs_journal *journal) { - int result; - - result = 0; - if (journal->j_dev_bd != NULL) { - result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode); + blkdev_put(journal->j_dev_bd, 
journal->j_dev_mode); journal->j_dev_bd = NULL; } - - if (result != 0) { - reiserfs_warning(super, "sh-457", - "Cannot release journal device: %i", result); - } - return result; } static int journal_init_dev(struct super_block *super, diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 9cc0740adffa..33532f79b4f7 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -394,20 +394,24 @@ static int set_sb(struct super_block *sb, void *data) return -ENOENT; } +struct reiserfs_seq_private { + struct super_block *sb; + int (*show) (struct seq_file *, struct super_block *); +}; + static void *r_start(struct seq_file *m, loff_t * pos) { - struct proc_dir_entry *de = m->private; - struct super_block *s = de->parent->data; + struct reiserfs_seq_private *priv = m->private; loff_t l = *pos; if (l) return NULL; - if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, s))) + if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, priv->sb))) return NULL; - up_write(&s->s_umount); - return s; + up_write(&priv->sb->s_umount); + return priv->sb; } static void *r_next(struct seq_file *m, void *v, loff_t * pos) @@ -426,9 +430,8 @@ static void r_stop(struct seq_file *m, void *v) static int r_show(struct seq_file *m, void *v) { - struct proc_dir_entry *de = m->private; - int (*show) (struct seq_file *, struct super_block *) = de->data; - return show(m, v); + struct reiserfs_seq_private *priv = m->private; + return priv->show(m, v); } static const struct seq_operations r_ops = { @@ -440,11 +443,15 @@ static const struct seq_operations r_ops = { static int r_open(struct inode *inode, struct file *file) { - int ret = seq_open(file, &r_ops); + struct reiserfs_seq_private *priv; + int ret = seq_open_private(file, &r_ops, + sizeof(struct reiserfs_seq_private)); if (!ret) { struct seq_file *m = file->private_data; - m->private = PDE(inode); + priv = m->private; + priv->sb = proc_get_parent_data(inode); + priv->show = PDE_DATA(inode); } return ret; } @@ -453,7 +460,7 @@ 
static const struct file_operations r_file_operations = { .open = r_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, .owner = THIS_MODULE, }; @@ -479,9 +486,8 @@ int reiserfs_proc_info_init(struct super_block *sb) *s = '!'; spin_lock_init(&__PINFO(sb).lock); - REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root); + REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb); if (REISERFS_SB(sb)->procdir) { - REISERFS_SB(sb)->procdir->data = sb; add_file(sb, "version", show_version); add_file(sb, "super", show_super); add_file(sb, "per-level", show_per_level); @@ -499,29 +505,17 @@ int reiserfs_proc_info_init(struct super_block *sb) int reiserfs_proc_info_done(struct super_block *sb) { struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; - char b[BDEVNAME_SIZE]; - char *s; + if (de) { + char b[BDEVNAME_SIZE]; + char *s; - /* Some block devices use /'s */ - strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); - s = strchr(b, '/'); - if (s) - *s = '!'; + /* Some block devices use /'s */ + strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + s = strchr(b, '/'); + if (s) + *s = '!'; - if (de) { - remove_proc_entry("journal", de); - remove_proc_entry("oidmap", de); - remove_proc_entry("on-disk-super", de); - remove_proc_entry("bitmap", de); - remove_proc_entry("per-level", de); - remove_proc_entry("super", de); - remove_proc_entry("version", de); - } - spin_lock(&__PINFO(sb).lock); - __PINFO(sb).exiting = 1; - spin_unlock(&__PINFO(sb).lock); - if (proc_info_root) { - remove_proc_entry(b, proc_info_root); + remove_proc_subtree(b, proc_info_root); REISERFS_SB(sb)->procdir = NULL; } return 0; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 418bdc3a57da..f8a23c3078f8 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1147,8 +1147,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin "on filesystem root."); return 0; } - qf_names[qtype] = - 
kmalloc(strlen(arg) + 1, GFP_KERNEL); + qf_names[qtype] = kstrdup(arg, GFP_KERNEL); if (!qf_names[qtype]) { reiserfs_warning(s, "reiserfs-2502", "not enough memory " @@ -1156,7 +1155,6 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin "quotafile name."); return 0; } - strcpy(qf_names[qtype], arg); if (qtype == USRQUOTA) *mount_options |= 1 << REISERFS_USRQUOTA; else @@ -2434,6 +2432,7 @@ struct file_system_type reiserfs_fs_type = { .kill_sb = reiserfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("reiserfs"); MODULE_DESCRIPTION("ReiserFS journaled filesystem"); MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>"); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index c196369fe408..4cce1d9552fb 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -187,8 +187,8 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset, if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) return -ENOSPC; - if (name[0] == '.' && (name[1] == '\0' || - (name[1] == '.' && name[2] == '\0'))) + if (name[0] == '.' 
&& (namelen < 2 || + (namelen == 2 && name[1] == '.'))) return 0; dentry = lookup_one_len(name, dbuf->xadir, namelen); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 7e8d3a80bdab..15cbc41ee365 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -599,6 +599,7 @@ static struct file_system_type romfs_fs_type = { .kill_sb = romfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("romfs"); /* * inode storage initialiser diff --git a/fs/seq_file.c b/fs/seq_file.c index 38bb59f3f2ad..774c1eb7f1c9 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -599,6 +599,24 @@ int single_open(struct file *file, int (*show)(struct seq_file *, void *), } EXPORT_SYMBOL(single_open); +int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), + void *data, size_t size) +{ + char *buf = kmalloc(size, GFP_KERNEL); + int ret; + if (!buf) + return -ENOMEM; + ret = single_open(file, show, data); + if (ret) { + kfree(buf); + return ret; + } + ((struct seq_file *)file->private_data)->buf = buf; + ((struct seq_file *)file->private_data)->size = size; + return 0; +} +EXPORT_SYMBOL(single_open_size); + int single_release(struct inode *inode, struct file *file) { const struct seq_operations *op = ((struct seq_file *)file->private_data)->op; diff --git a/fs/signalfd.c b/fs/signalfd.c index b53486961735..424b7b65321f 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -30,6 +30,7 @@ #include <linux/signalfd.h> #include <linux/syscalls.h> #include <linux/proc_fs.h> +#include <linux/compat.h> void signalfd_cleanup(struct sighand_struct *sighand) { @@ -311,3 +312,33 @@ SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask, { return sys_signalfd4(ufd, user_mask, sizemask, 0); } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, + const compat_sigset_t __user *,sigmask, + compat_size_t, sigsetsize, + int, flags) +{ + compat_sigset_t ss32; + sigset_t tmp; + sigset_t __user *ksigmask; + + if (sigsetsize != sizeof(compat_sigset_t)) + return 
-EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&tmp, &ss32); + ksigmask = compat_alloc_user_space(sizeof(sigset_t)); + if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) + return -EFAULT; + + return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); +} + +COMPAT_SYSCALL_DEFINE3(signalfd, int, ufd, + const compat_sigset_t __user *,sigmask, + compat_size_t, sigsetsize) +{ + return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); +} +#endif diff --git a/fs/splice.c b/fs/splice.c index 718bd0056384..e6b25598c8c4 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -31,6 +31,8 @@ #include <linux/security.h> #include <linux/gfp.h> #include <linux/socket.h> +#include <linux/compat.h> +#include "internal.h" /* * Attempt to steal a page from a pipe buffer. This should perhaps go into @@ -217,7 +219,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, page_nr++; ret += buf->len; - if (pipe->inode) + if (pipe->files) do_wakeup = 1; if (!--spd->nr_pages) @@ -827,7 +829,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, ops->release(pipe, buf); pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; - if (pipe->inode) + if (pipe->files) sd->need_wakeup = true; } @@ -999,8 +1001,6 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; - sb_start_write(inode->i_sb); - pipe_lock(pipe); splice_from_pipe_begin(&sd); @@ -1036,7 +1036,6 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, *ppos += ret; balance_dirty_pages_ratelimited(mapping); } - sb_end_write(inode->i_sb); return ret; } @@ -1048,9 +1047,10 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, { int ret; void *data; + loff_t tmp = sd->pos; data = buf->ops->map(pipe, buf, 0); - ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); + ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); 
buf->ops->unmap(pipe, buf, data); return ret; @@ -1115,7 +1115,10 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, else splice_write = default_file_splice_write; - return splice_write(pipe, out, ppos, len, flags); + file_start_write(out); + ret = splice_write(pipe, out, ppos, len, flags); + file_end_write(out); + return ret; } /* @@ -1181,7 +1184,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, */ pipe = current->splice_pipe; if (unlikely(!pipe)) { - pipe = alloc_pipe_info(NULL); + pipe = alloc_pipe_info(); if (!pipe) return -ENOMEM; @@ -1688,6 +1691,27 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, return error; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32, + unsigned int, nr_segs, unsigned int, flags) +{ + unsigned i; + struct iovec __user *iov; + if (nr_segs > UIO_MAXIOV) + return -EINVAL; + iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); + for (i = 0; i < nr_segs; i++) { + struct compat_iovec v; + if (get_user(v.iov_base, &iov32[i].iov_base) || + get_user(v.iov_len, &iov32[i].iov_len) || + put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || + put_user(v.iov_len, &iov[i].iov_len)) + return -EFAULT; + } + return sys_vmsplice(fd, iov, nr_segs, flags); +} +#endif + SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 260e3928d4f5..60553a9053ca 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -489,6 +489,7 @@ static struct file_system_type squashfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV }; +MODULE_ALIAS_FS("squashfs"); static const struct super_operations squashfs_super_ops = { .alloc_inode = squashfs_alloc_inode, diff --git a/fs/sync.c b/fs/sync.c index 2c5d6639a66a..905f3f6b3d85 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ 
-283,8 +283,8 @@ EXPORT_SYMBOL(generic_write_sync); * already-instantiated disk blocks, there are no guarantees here that the data * will be available after a crash. */ -SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, - unsigned int flags) +SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, + unsigned int, flags) { int ret; struct fd f; @@ -365,29 +365,11 @@ out_put: out: return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes, - long flags) -{ - return SYSC_sync_file_range((int) fd, offset, nbytes, - (unsigned int) flags); -} -SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range); -#endif /* It would be nice if people remember that not all the world's an i386 when they introduce new system calls */ -SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags, - loff_t offset, loff_t nbytes) +SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags, + loff_t, offset, loff_t, nbytes) { return sys_sync_file_range(fd, offset, nbytes, flags); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_sync_file_range2(long fd, long flags, - loff_t offset, loff_t nbytes) -{ - return SYSC_sync_file_range2((int) fd, (unsigned int) flags, - offset, nbytes); -} -SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2); -#endif diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2fbdff6be25c..e8e0e71b29d5 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -165,21 +165,8 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) if (unlikely(!sd)) return NULL; - while (1) { - int v, t; - - v = atomic_read(&sd->s_active); - if (unlikely(v < 0)) - return NULL; - - t = atomic_cmpxchg(&sd->s_active, v, v + 1); - if (likely(t == v)) - break; - if (t < 0) - return NULL; - - cpu_relax(); - } + if (!atomic_inc_unless_negative(&sd->s_active)) + return NULL; if (likely(!ignore_lockdep(sd))) rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); @@ -281,6 +268,10 
@@ void release_sysfs_dirent(struct sysfs_dirent * sd) */ parent_sd = sd->s_parent; + WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED), + "sysfs: free using entry: %s/%s\n", + parent_sd ? parent_sd->s_name : "", sd->s_name); + if (sysfs_type(sd) == SYSFS_KOBJ_LINK) sysfs_put(sd->s_symlink.target_sd); if (sysfs_type(sd) & SYSFS_COPY_NAME) @@ -399,7 +390,7 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) sd->s_name = name; sd->s_mode = mode; - sd->s_flags = type; + sd->s_flags = type | SYSFS_FLAG_REMOVED; return sd; @@ -479,6 +470,9 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; } + /* Mark the entry added into directory tree */ + sd->s_flags &= ~SYSFS_FLAG_REMOVED; + return 0; } @@ -1012,6 +1006,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) enum kobj_ns_type type; const void *ns; ino_t ino; + loff_t off; type = sysfs_ns_type(parent_sd); ns = sysfs_info(dentry->d_sb)->ns[type]; @@ -1020,6 +1015,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } if (filp->f_pos == 1) { if (parent_sd->s_parent) @@ -1028,8 +1025,11 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } mutex_lock(&sysfs_mutex); + off = filp->f_pos; for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); pos; pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) { @@ -1041,27 +1041,43 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) len = strlen(name); ino = pos->s_ino; type = dt_type(pos); - filp->f_pos = pos->s_hash; + off = filp->f_pos = pos->s_hash; filp->private_data = sysfs_get(pos); mutex_unlock(&sysfs_mutex); - 
ret = filldir(dirent, name, len, filp->f_pos, ino, type); + ret = filldir(dirent, name, len, off, ino, type); mutex_lock(&sysfs_mutex); if (ret < 0) break; } mutex_unlock(&sysfs_mutex); - if ((filp->f_pos > 1) && !pos) { /* EOF */ - filp->f_pos = INT_MAX; + + /* don't reference last entry if its refcount is dropped */ + if (!pos) { filp->private_data = NULL; + + /* EOF and not changed as 0 or 1 in read/write path */ + if (off == filp->f_pos && off > 1) + filp->f_pos = INT_MAX; } return 0; } +static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} const struct file_operations sysfs_dir_operations = { .read = generic_read_dir, .readdir = sysfs_readdir, .release = sysfs_dir_release, - .llseek = generic_file_llseek, + .llseek = sysfs_dir_llseek, }; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8d924b5ec733..afd83273e6ce 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/user_namespace.h> #include "sysfs.h" @@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; + if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + return ERR_PTR(-EPERM); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/fs/sysv/super.c b/fs/sysv/super.c index a38e87bdd78d..d0c6a007ce83 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -545,6 +545,7 @@ static struct file_system_type sysv_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("sysv"); static struct file_system_type v7_fs_type = { .owner = THIS_MODULE, @@ -553,6 +554,8 @@ static struct file_system_type v7_fs_type = { .kill_sb = 
kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("v7"); +MODULE_ALIAS("v7"); static int __init init_sysv_fs(void) { @@ -586,5 +589,4 @@ static void __exit exit_sysv_fs(void) module_init(init_sysv_fs) module_exit(exit_sysv_fs) -MODULE_ALIAS("v7"); MODULE_LICENSE("GPL"); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f12189d2db1d..14374530784c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -50,6 +50,7 @@ */ #include "ubifs.h" +#include <linux/aio.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/slab.h> diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ddc0f6ae65e9..f21acf0ef01f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1568,6 +1568,12 @@ static int ubifs_remount_rw(struct ubifs_info *c) c->remounting_rw = 1; c->ro_mount = 0; + if (c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + return err; + } + err = check_free_space(c); if (err) goto out; @@ -1684,12 +1690,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) err = dbg_check_space_info(c); } - if (c->space_fixup) { - err = ubifs_fixup_free_space(c); - if (err) - goto out; - } - mutex_unlock(&c->umount_mutex); return err; @@ -2174,6 +2174,7 @@ static struct file_system_type ubifs_fs_type = { .mount = ubifs_mount, .kill_sb = kill_ubifs_super, }; +MODULE_ALIAS_FS("ubifs"); /* * Inode slab cache constructor. 
diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7a12e48ad819..b6d15d349810 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -38,6 +38,7 @@ #include <linux/slab.h> #include <linux/crc-itu-t.h> #include <linux/mpage.h> +#include <linux/aio.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/udf/super.c b/fs/udf/super.c index bc5b30a819e8..9ac4057a86c9 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -118,6 +118,7 @@ static struct file_system_type udf_fstype = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("udf"); static struct kmem_cache *udf_inode_cachep; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index dc8e3a861d0f..329f2f53b7ed 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1500,6 +1500,7 @@ static struct file_system_type ufs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ufs"); static int __init init_ufs_fs(void) { diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 95425b59ce0a..b6c2f94e041e 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -26,8 +26,7 @@ struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi, count = size >> uspi->s_fshift; if (count > UFS_MAXFRAG) return NULL; - ubh = (struct ufs_buffer_head *) - kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS); + ubh = kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS); if (!ubh) return NULL; ubh->fragment = fragment; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3244c988d379..2b2691b73428 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,6 +31,7 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" #include "xfs_bmap.h" +#include <linux/aio.h> #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagevec.h> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index cf6eacd4169a..a5f2042aec8b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -36,6 +36,7 @@ #include "xfs_ioctl.h" #include "xfs_trace.h" +#include <linux/aio.h> #include <linux/dcache.h> 
#include <linux/falloc.h> #include <linux/pagevec.h> @@ -775,8 +776,6 @@ xfs_file_aio_write( if (ocount == 0) return 0; - sb_start_write(inode->i_sb); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { ret = -EIO; goto out; @@ -800,7 +799,6 @@ xfs_file_aio_write( } out: - sb_end_write(inode->i_sb); return ret; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c407121873b4..ea341cea68cb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1561,6 +1561,7 @@ static struct file_system_type xfs_fs_type = { .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) |